]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
d164b209 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
2e528b49 | 25 | /* |
a6255b7f | 26 | * Copyright (c) 2013, 2016 by Delphix. All rights reserved. |
2e528b49 | 27 | */ |
34dc7c2f | 28 | |
34dc7c2f BB |
29 | #include <sys/zfs_context.h> |
30 | #include <sys/spa.h> | |
31 | #include <sys/vdev_impl.h> | |
32 | #include <sys/zio.h> | |
33 | #include <sys/kstat.h> | |
a6255b7f | 34 | #include <sys/abd.h> |
34dc7c2f BB |
35 | |
36 | /* | |
37 | * Virtual device read-ahead caching. | |
38 | * | |
39 | * This file implements a simple LRU read-ahead cache. When the DMU reads | |
40 | * a given block, it will often want other, nearby blocks soon thereafter. | |
41 | * We take advantage of this by reading a larger disk region and caching | |
42 | * the result. In the best case, this can turn 128 back-to-back 512-byte | |
43 | * reads into a single 64k read followed by 127 cache hits; this reduces | |
44 | * latency dramatically. In the worst case, it can turn an isolated 512-byte | |
45 | * read into a 64k read, which doesn't affect latency all that much but is | |
46 | * terribly wasteful of bandwidth. A more intelligent version of the cache | |
47 | * could keep track of access patterns and not do read-ahead unless it sees | |
48 | * at least two temporally close I/Os to the same region. Currently, only | |
49 | * metadata I/O is inflated. A further enhancement could take advantage of |
50 | * more semantic information about the I/O. And it could use something | |
51 | * faster than an AVL tree; that was chosen solely for convenience. | |
52 | * | |
53 | * There are five cache operations: allocate, fill, read, write, evict. | |
54 | * | |
55 | * (1) Allocate. This reserves a cache entry for the specified region. | |
56 | * We separate the allocate and fill operations so that multiple threads | |
57 | * don't generate I/O for the same cache miss. | |
58 | * | |
59 | * (2) Fill. When the I/O for a cache miss completes, the fill routine | |
60 | * places the data in the previously allocated cache entry. | |
61 | * | |
62 | * (3) Read. Read data from the cache. | |
63 | * | |
64 | * (4) Write. Update cache contents after write completion. | |
65 | * | |
66 | * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry | |
67 | * if the total cache size exceeds zfs_vdev_cache_size. | |
68 | */ | |
69 | ||
70 | /* | |
71 | * These tunables are for performance analysis. | |
72 | */ | |
73 | /* | |
74 | * All i/os smaller than zfs_vdev_cache_max will be turned into | |
75 | * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software | |
76 | * track buffer). At most zfs_vdev_cache_size bytes will be kept in each | |
77 | * vdev's vdev_cache. | |
2cc6c8db GA |
78 | * |
79 | * TODO: Note that with the current ZFS code, it turns out that the | |
80 | * vdev cache is not helpful, and in some cases actually harmful. It | |
81 | * is better if we disable this. Once some time has passed, we should | |
82 | * actually remove this to simplify the code. For now we just disable | |
83 | * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 | |
84 | * has made these same changes. | |
34dc7c2f BB |
85 | */ |
86 | int zfs_vdev_cache_max = 1<<14; /* 16KB */ | |
2cc6c8db | 87 | int zfs_vdev_cache_size = 0; |
34dc7c2f BB |
88 | int zfs_vdev_cache_bshift = 16; |
89 | ||
90 | #define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ | |
91 | ||
92 | kstat_t *vdc_ksp = NULL; | |
93 | ||
94 | typedef struct vdc_stats { | |
95 | kstat_named_t vdc_stat_delegations; | |
96 | kstat_named_t vdc_stat_hits; | |
97 | kstat_named_t vdc_stat_misses; | |
98 | } vdc_stats_t; | |
99 | ||
100 | static vdc_stats_t vdc_stats = { | |
101 | { "delegations", KSTAT_DATA_UINT64 }, | |
102 | { "hits", KSTAT_DATA_UINT64 }, | |
103 | { "misses", KSTAT_DATA_UINT64 } | |
104 | }; | |
105 | ||
bc89ac84 | 106 | #define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64); |
34dc7c2f | 107 | |
ee36c709 | 108 | static inline int |
34dc7c2f BB |
109 | vdev_cache_offset_compare(const void *a1, const void *a2) |
110 | { | |
ee36c709 GN |
111 | const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; |
112 | const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; | |
34dc7c2f | 113 | |
ee36c709 | 114 | return (AVL_CMP(ve1->ve_offset, ve2->ve_offset)); |
34dc7c2f BB |
115 | } |
116 | ||
117 | static int | |
118 | vdev_cache_lastused_compare(const void *a1, const void *a2) | |
119 | { | |
ee36c709 GN |
120 | const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; |
121 | const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; | |
34dc7c2f | 122 | |
ee36c709 GN |
123 | int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused); |
124 | if (likely(cmp)) | |
125 | return (cmp); | |
34dc7c2f BB |
126 | |
127 | /* | |
128 | * Among equally old entries, sort by offset to ensure uniqueness. | |
129 | */ | |
130 | return (vdev_cache_offset_compare(a1, a2)); | |
131 | } | |
132 | ||
133 | /* | |
134 | * Evict the specified entry from the cache. | |
135 | */ | |
136 | static void | |
137 | vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) | |
138 | { | |
139 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
a6255b7f DQ |
140 | ASSERT3P(ve->ve_fill_io, ==, NULL); |
141 | ASSERT3P(ve->ve_abd, !=, NULL); | |
34dc7c2f | 142 | |
34dc7c2f BB |
143 | avl_remove(&vc->vc_lastused_tree, ve); |
144 | avl_remove(&vc->vc_offset_tree, ve); | |
a6255b7f | 145 | abd_free(ve->ve_abd); |
34dc7c2f BB |
146 | kmem_free(ve, sizeof (vdev_cache_entry_t)); |
147 | } | |
148 | ||
149 | /* | |
150 | * Allocate an entry in the cache. At the point we don't have the data, | |
151 | * we're just creating a placeholder so that multiple threads don't all | |
152 | * go off and read the same blocks. | |
153 | */ | |
154 | static vdev_cache_entry_t * | |
155 | vdev_cache_allocate(zio_t *zio) | |
156 | { | |
157 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
158 | uint64_t offset = P2ALIGN(zio->io_offset, VCBS); | |
159 | vdev_cache_entry_t *ve; | |
160 | ||
161 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
162 | ||
163 | if (zfs_vdev_cache_size == 0) | |
164 | return (NULL); | |
165 | ||
166 | /* | |
167 | * If adding a new entry would exceed the cache size, | |
168 | * evict the oldest entry (LRU). | |
169 | */ | |
170 | if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > | |
171 | zfs_vdev_cache_size) { | |
172 | ve = avl_first(&vc->vc_lastused_tree); | |
b128c09f | 173 | if (ve->ve_fill_io != NULL) |
34dc7c2f | 174 | return (NULL); |
a6255b7f | 175 | ASSERT3U(ve->ve_hits, !=, 0); |
34dc7c2f BB |
176 | vdev_cache_evict(vc, ve); |
177 | } | |
178 | ||
79c76d5b | 179 | ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); |
34dc7c2f | 180 | ve->ve_offset = offset; |
428870ff | 181 | ve->ve_lastused = ddi_get_lbolt(); |
a6255b7f | 182 | ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); |
34dc7c2f BB |
183 | |
184 | avl_add(&vc->vc_offset_tree, ve); | |
185 | avl_add(&vc->vc_lastused_tree, ve); | |
186 | ||
187 | return (ve); | |
188 | } | |
189 | ||
190 | static void | |
191 | vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) | |
192 | { | |
193 | uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); | |
194 | ||
195 | ASSERT(MUTEX_HELD(&vc->vc_lock)); | |
a6255b7f | 196 | ASSERT3P(ve->ve_fill_io, ==, NULL); |
34dc7c2f | 197 | |
428870ff | 198 | if (ve->ve_lastused != ddi_get_lbolt()) { |
34dc7c2f | 199 | avl_remove(&vc->vc_lastused_tree, ve); |
428870ff | 200 | ve->ve_lastused = ddi_get_lbolt(); |
34dc7c2f BB |
201 | avl_add(&vc->vc_lastused_tree, ve); |
202 | } | |
203 | ||
204 | ve->ve_hits++; | |
a6255b7f | 205 | abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); |
34dc7c2f BB |
206 | } |
207 | ||
208 | /* | |
209 | * Fill a previously allocated cache entry with data. | |
210 | */ | |
211 | static void | |
d164b209 | 212 | vdev_cache_fill(zio_t *fio) |
34dc7c2f | 213 | { |
d164b209 | 214 | vdev_t *vd = fio->io_vd; |
34dc7c2f | 215 | vdev_cache_t *vc = &vd->vdev_cache; |
d164b209 BB |
216 | vdev_cache_entry_t *ve = fio->io_private; |
217 | zio_t *pio; | |
34dc7c2f | 218 | |
a6255b7f | 219 | ASSERT3U(fio->io_size, ==, VCBS); |
34dc7c2f BB |
220 | |
221 | /* | |
222 | * Add data to the cache. | |
223 | */ | |
224 | mutex_enter(&vc->vc_lock); | |
225 | ||
a6255b7f DQ |
226 | ASSERT3P(ve->ve_fill_io, ==, fio); |
227 | ASSERT3U(ve->ve_offset, ==, fio->io_offset); | |
228 | ASSERT3P(ve->ve_abd, ==, fio->io_abd); | |
34dc7c2f BB |
229 | |
230 | ve->ve_fill_io = NULL; | |
231 | ||
232 | /* | |
233 | * Even if this cache line was invalidated by a missed write update, | |
234 | * any reads that were queued up before the missed update are still | |
235 | * valid, so we can satisfy them from this line before we evict it. | |
236 | */ | |
1c27024e | 237 | zio_link_t *zl = NULL; |
3dfb57a3 | 238 | while ((pio = zio_walk_parents(fio, &zl)) != NULL) |
d164b209 | 239 | vdev_cache_hit(vc, ve, pio); |
34dc7c2f | 240 | |
d164b209 | 241 | if (fio->io_error || ve->ve_missed_update) |
34dc7c2f BB |
242 | vdev_cache_evict(vc, ve); |
243 | ||
244 | mutex_exit(&vc->vc_lock); | |
34dc7c2f BB |
245 | } |
246 | ||
247 | /* | |
b0bc7a84 | 248 | * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. |
34dc7c2f | 249 | */ |
b0bc7a84 | 250 | boolean_t |
34dc7c2f BB |
251 | vdev_cache_read(zio_t *zio) |
252 | { | |
253 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
5fed499d | 254 | vdev_cache_entry_t *ve, *ve_search; |
34dc7c2f | 255 | uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); |
34dc7c2f | 256 | zio_t *fio; |
d1d7e268 | 257 | ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS)); |
34dc7c2f | 258 | |
a6255b7f | 259 | ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); |
34dc7c2f BB |
260 | |
261 | if (zio->io_flags & ZIO_FLAG_DONT_CACHE) | |
b0bc7a84 | 262 | return (B_FALSE); |
34dc7c2f BB |
263 | |
264 | if (zio->io_size > zfs_vdev_cache_max) | |
b0bc7a84 | 265 | return (B_FALSE); |
34dc7c2f BB |
266 | |
267 | /* | |
268 | * If the I/O straddles two or more cache blocks, don't cache it. | |
269 | */ | |
b128c09f | 270 | if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) |
b0bc7a84 | 271 | return (B_FALSE); |
34dc7c2f | 272 | |
a6255b7f | 273 | ASSERT3U(cache_phase + zio->io_size, <=, VCBS); |
34dc7c2f BB |
274 | |
275 | mutex_enter(&vc->vc_lock); | |
276 | ||
79c76d5b | 277 | ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP); |
5fed499d BB |
278 | ve_search->ve_offset = cache_offset; |
279 | ve = avl_find(&vc->vc_offset_tree, ve_search, NULL); | |
d1d7e268 | 280 | kmem_free(ve_search, sizeof (vdev_cache_entry_t)); |
34dc7c2f BB |
281 | |
282 | if (ve != NULL) { | |
283 | if (ve->ve_missed_update) { | |
284 | mutex_exit(&vc->vc_lock); | |
b0bc7a84 | 285 | return (B_FALSE); |
34dc7c2f BB |
286 | } |
287 | ||
288 | if ((fio = ve->ve_fill_io) != NULL) { | |
34dc7c2f | 289 | zio_vdev_io_bypass(zio); |
d164b209 | 290 | zio_add_child(zio, fio); |
34dc7c2f BB |
291 | mutex_exit(&vc->vc_lock); |
292 | VDCSTAT_BUMP(vdc_stat_delegations); | |
b0bc7a84 | 293 | return (B_TRUE); |
34dc7c2f BB |
294 | } |
295 | ||
296 | vdev_cache_hit(vc, ve, zio); | |
297 | zio_vdev_io_bypass(zio); | |
298 | ||
299 | mutex_exit(&vc->vc_lock); | |
34dc7c2f | 300 | VDCSTAT_BUMP(vdc_stat_hits); |
b0bc7a84 | 301 | return (B_TRUE); |
34dc7c2f BB |
302 | } |
303 | ||
304 | ve = vdev_cache_allocate(zio); | |
305 | ||
306 | if (ve == NULL) { | |
307 | mutex_exit(&vc->vc_lock); | |
b0bc7a84 | 308 | return (B_FALSE); |
34dc7c2f BB |
309 | } |
310 | ||
b128c09f | 311 | fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, |
a6255b7f | 312 | ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, |
b128c09f | 313 | ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); |
34dc7c2f BB |
314 | |
315 | ve->ve_fill_io = fio; | |
34dc7c2f | 316 | zio_vdev_io_bypass(zio); |
d164b209 | 317 | zio_add_child(zio, fio); |
34dc7c2f BB |
318 | |
319 | mutex_exit(&vc->vc_lock); | |
320 | zio_nowait(fio); | |
321 | VDCSTAT_BUMP(vdc_stat_misses); | |
322 | ||
b0bc7a84 | 323 | return (B_TRUE); |
34dc7c2f BB |
324 | } |
325 | ||
326 | /* | |
327 | * Update cache contents upon write completion. | |
328 | */ | |
329 | void | |
330 | vdev_cache_write(zio_t *zio) | |
331 | { | |
332 | vdev_cache_t *vc = &zio->io_vd->vdev_cache; | |
333 | vdev_cache_entry_t *ve, ve_search; | |
334 | uint64_t io_start = zio->io_offset; | |
335 | uint64_t io_end = io_start + zio->io_size; | |
336 | uint64_t min_offset = P2ALIGN(io_start, VCBS); | |
337 | uint64_t max_offset = P2ROUNDUP(io_end, VCBS); | |
338 | avl_index_t where; | |
339 | ||
a6255b7f | 340 | ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); |
34dc7c2f BB |
341 | |
342 | mutex_enter(&vc->vc_lock); | |
343 | ||
344 | ve_search.ve_offset = min_offset; | |
345 | ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); | |
346 | ||
347 | if (ve == NULL) | |
348 | ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); | |
349 | ||
350 | while (ve != NULL && ve->ve_offset < max_offset) { | |
351 | uint64_t start = MAX(ve->ve_offset, io_start); | |
352 | uint64_t end = MIN(ve->ve_offset + VCBS, io_end); | |
353 | ||
354 | if (ve->ve_fill_io != NULL) { | |
355 | ve->ve_missed_update = 1; | |
356 | } else { | |
12aec7dc CC |
357 | abd_copy_off(ve->ve_abd, zio->io_abd, |
358 | start - ve->ve_offset, start - io_start, | |
359 | end - start); | |
34dc7c2f BB |
360 | } |
361 | ve = AVL_NEXT(&vc->vc_offset_tree, ve); | |
362 | } | |
363 | mutex_exit(&vc->vc_lock); | |
364 | } | |
365 | ||
366 | void | |
367 | vdev_cache_purge(vdev_t *vd) | |
368 | { | |
369 | vdev_cache_t *vc = &vd->vdev_cache; | |
370 | vdev_cache_entry_t *ve; | |
371 | ||
372 | mutex_enter(&vc->vc_lock); | |
373 | while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) | |
374 | vdev_cache_evict(vc, ve); | |
375 | mutex_exit(&vc->vc_lock); | |
376 | } | |
377 | ||
378 | void | |
379 | vdev_cache_init(vdev_t *vd) | |
380 | { | |
381 | vdev_cache_t *vc = &vd->vdev_cache; | |
382 | ||
383 | mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); | |
384 | ||
385 | avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, | |
386 | sizeof (vdev_cache_entry_t), | |
387 | offsetof(struct vdev_cache_entry, ve_offset_node)); | |
388 | ||
389 | avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, | |
390 | sizeof (vdev_cache_entry_t), | |
391 | offsetof(struct vdev_cache_entry, ve_lastused_node)); | |
392 | } | |
393 | ||
394 | void | |
395 | vdev_cache_fini(vdev_t *vd) | |
396 | { | |
397 | vdev_cache_t *vc = &vd->vdev_cache; | |
398 | ||
399 | vdev_cache_purge(vd); | |
400 | ||
401 | avl_destroy(&vc->vc_offset_tree); | |
402 | avl_destroy(&vc->vc_lastused_tree); | |
403 | ||
404 | mutex_destroy(&vc->vc_lock); | |
405 | } | |
406 | ||
407 | void | |
408 | vdev_cache_stat_init(void) | |
409 | { | |
410 | vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", | |
411 | KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), | |
412 | KSTAT_FLAG_VIRTUAL); | |
413 | if (vdc_ksp != NULL) { | |
414 | vdc_ksp->ks_data = &vdc_stats; | |
415 | kstat_install(vdc_ksp); | |
416 | } | |
417 | } | |
418 | ||
419 | void | |
420 | vdev_cache_stat_fini(void) | |
421 | { | |
422 | if (vdc_ksp != NULL) { | |
423 | kstat_delete(vdc_ksp); | |
424 | vdc_ksp = NULL; | |
425 | } | |
426 | } | |
c409e464 BB |
427 | |
428 | #if defined(_KERNEL) && defined(HAVE_SPL) | |
429 | module_param(zfs_vdev_cache_max, int, 0644); | |
430 | MODULE_PARM_DESC(zfs_vdev_cache_max, "Inflate reads small than max"); | |
431 | ||
432 | module_param(zfs_vdev_cache_size, int, 0444); | |
433 | MODULE_PARM_DESC(zfs_vdev_cache_size, "Total size of the per-disk cache"); | |
434 | ||
435 | module_param(zfs_vdev_cache_bshift, int, 0644); | |
436 | MODULE_PARM_DESC(zfs_vdev_cache_bshift, "Shift size to inflate reads too"); | |
437 | #endif |