]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/arc.c
Rebase master to b117
[mirror_zfs.git] / module / zfs / arc.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * DVA-based Adjustable Replacement Cache
28 *
29 * While much of the theory of operation used here is
30 * based on the self-tuning, low overhead replacement cache
31 * presented by Megiddo and Modha at FAST 2003, there are some
32 * significant differences:
33 *
34 * 1. The Megiddo and Modha model assumes any page is evictable.
35 * Pages in its cache cannot be "locked" into memory. This makes
36 * the eviction algorithm simple: evict the last page in the list.
37 * This also makes the performance characteristics easy to reason
38 * about. Our cache is not so simple. At any given moment, some
39 * subset of the blocks in the cache are un-evictable because we
40 * have handed out a reference to them. Blocks are only evictable
41 * when there are no external references active. This makes
42 * eviction far more problematic: we choose to evict the evictable
43 * blocks that are the "lowest" in the list.
44 *
45 * There are times when it is not possible to evict the requested
46 * space. In these circumstances we are unable to adjust the cache
47 * size. To prevent the cache growing unbounded at these times we
48 * implement a "cache throttle" that slows the flow of new data
49 * into the cache until we can make space available.
50 *
51 * 2. The Megiddo and Modha model assumes a fixed cache size.
52 * Pages are evicted when the cache is full and there is a cache
53 * miss. Our model has a variable sized cache. It grows with
54 * high use, but also tries to react to memory pressure from the
55 * operating system: decreasing its size when system memory is
56 * tight.
57 *
58 * 3. The Megiddo and Modha model assumes a fixed page size. All
59 * elements of the cache are therefore exactly the same size. So
60 * when adjusting the cache size following a cache miss, it's simply
61 * a matter of choosing a single page to evict. In our model, we
62 * have variable sized cache blocks (ranging from 512 bytes to
63 * 128K bytes). We therefore choose a set of blocks to evict to make
64 * space for a cache miss that approximates as closely as possible
65 * the space used by the new block.
66 *
67 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
68 * by N. Megiddo & D. Modha, FAST 2003
69 */
70
71 /*
72 * The locking model:
73 *
74 * A new reference to a cache buffer can be obtained in two
75 * ways: 1) via a hash table lookup using the DVA as a key,
76 * or 2) via one of the ARC lists. The arc_read() interface
77 * uses method 1, while the internal arc algorithms for
78 * adjusting the cache use method 2. We therefore provide two
79 * types of locks: 1) the hash table lock array, and 2) the
80 * arc list locks.
81 *
82 * Buffers do not have their own mutexes, rather they rely on the
83 * hash table mutexes for the bulk of their protection (i.e. most
84 * fields in the arc_buf_hdr_t are protected by these mutexes).
85 *
86 * buf_hash_find() returns the appropriate mutex (held) when it
87 * locates the requested buffer in the hash table. It returns
88 * NULL for the mutex if the buffer was not in the table.
89 *
90 * buf_hash_remove() expects the appropriate hash mutex to be
91 * already held before it is invoked.
92 *
93 * Each arc state also has a mutex which is used to protect the
94 * buffer list associated with the state. When attempting to
95 * obtain a hash table lock while holding an arc list lock you
96 * must use: mutex_tryenter() to avoid deadlock. Also note that
97 * the active state mutex must be held before the ghost state mutex.
98 *
99 * Arc buffers may have an associated eviction callback function.
100 * This function will be invoked prior to removing the buffer (e.g.
101 * in arc_do_user_evicts()). Note however that the data associated
102 * with the buffer may be evicted prior to the callback. The callback
103 * must be made with *no locks held* (to prevent deadlock). Additionally,
104 * the users of callbacks must ensure that their private data is
105 * protected from simultaneous callbacks from arc_buf_evict()
106 * and arc_do_user_evicts().
107 *
108 * Note that the majority of the performance stats are manipulated
109 * with atomic operations.
110 *
111 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
112 *
113 * - L2ARC buflist creation
114 * - L2ARC buflist eviction
115 * - L2ARC write completion, which walks L2ARC buflists
116 * - ARC header destruction, as it removes from L2ARC buflists
117 * - ARC header release, as it removes from L2ARC buflists
118 */
119
120 #include <sys/spa.h>
121 #include <sys/zio.h>
122 #include <sys/zio_checksum.h>
123 #include <sys/zfs_context.h>
124 #include <sys/arc.h>
125 #include <sys/refcount.h>
126 #include <sys/vdev.h>
127 #include <sys/vdev_impl.h>
128 #ifdef _KERNEL
129 #include <sys/vmsystm.h>
130 #include <vm/anon.h>
131 #include <sys/fs/swapnode.h>
132 #include <sys/dnlc.h>
133 #endif
134 #include <sys/callb.h>
135 #include <sys/kstat.h>
136
/*
 * State shared with the ARC reclaim thread.  hdr_recl(), the reclaim
 * callback registered for the header kmem cache, signals
 * arc_reclaim_thr_cv when it is invoked.
 * NOTE(review): arc_reclaim_thr_lock is presumably the mutex paired with
 * that condition variable and arc_thread_exit the shutdown handshake flag;
 * their users are not visible here -- confirm.
 */
137 static kmutex_t arc_reclaim_thr_lock;
138 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
139 static uint8_t arc_thread_exit;
140
/* Write-limit state that is defined outside this file. */
141 extern int zfs_write_limit_shift;
142 extern uint64_t zfs_write_limit_max;
143 extern kmutex_t zfs_write_limit_lock;
144
/*
 * NOTE(review): presumably the percentage of the DNLC to purge during a
 * reclaim pass; the consumer is not visible here -- confirm.
 */
145 #define ARC_REDUCE_DNLC_PERCENT 3
146 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
147
/*
 * The two reclaim strategies, described inline.
 * NOTE(review): the code that selects between them is not in view.
 */
148 typedef enum arc_reclaim_strategy {
149 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
150 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
151 } arc_reclaim_strategy_t;
152
153 /* number of seconds before growing cache again */
154 static int arc_grow_retry = 60;
155
156 /* shift of arc_c for calculating both min and max arc_p */
157 static int arc_p_min_shift = 4;
158
159 /* log2(fraction of arc to reclaim) */
160 static int arc_shrink_shift = 5;
161
162 /*
163 * minimum lifespan of a prefetch block in clock ticks
164 * (initialized in arc_init())
165 */
166 static int arc_min_prefetch_lifespan;
167
/*
 * Checked by hdr_recl(): once this is non-zero the reclaim callback no
 * longer signals the reclaim thread.
 */
168 static int arc_dead;
169
170 /*
171 * The arc has filled available memory and has now warmed up.
172 */
173 static boolean_t arc_warm;
174
175 /*
176 * These tunables are for performance analysis.
177 */
/*
 * All of these start out as zero.
 * NOTE(review): presumably zero means "keep the built-in default" and a
 * non-zero zfs_arc_* value overrides the matching arc_* static above; the
 * code that applies them is not in view -- confirm.
 */
178 uint64_t zfs_arc_max;
179 uint64_t zfs_arc_min;
180 uint64_t zfs_arc_meta_limit = 0;
181 int zfs_mdcomp_disable = 0;
182 int zfs_arc_grow_retry = 0;
183 int zfs_arc_shrink_shift = 0;
184 int zfs_arc_p_min_shift = 0;
185
186 /*
187 * Note that buffers can be in one of 6 states:
188 * ARC_anon - anonymous (discussed below)
189 * ARC_mru - recently used, currently cached
190 * ARC_mru_ghost - recently used, no longer in cache
191 * ARC_mfu - frequently used, currently cached
192 * ARC_mfu_ghost - frequently used, no longer in cache
193 * ARC_l2c_only - exists in L2ARC but not other states
194 * When there are no active references to the buffer, they
195 * are linked onto a list in one of these arc states. These are
196 * the only buffers that can be evicted or deleted. Within each
197 * state there are multiple lists, one for meta-data and one for
198 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
199 * etc.) is tracked separately so that it can be managed more
200 * explicitly: favored over data, limited explicitly.
201 *
202 * Anonymous buffers are buffers that are not associated with
203 * a DVA. These are buffers that hold dirty block copies
204 * before they are written to stable storage. By definition,
205 * they are "ref'd" and are considered part of arc_mru
206 * that cannot be freed. Generally, they will acquire a DVA
207 * as they are written and migrate onto the arc_mru list.
208 *
209 * The ARC_l2c_only state is for buffers that are in the second
210 * level ARC but no longer in any of the ARC_m* lists. The second
211 * level ARC itself may also contain buffers that are in any of
212 * the ARC_m* states - meaning that a buffer can exist in two
213 * places. The reason for the ARC_l2c_only state is to keep the
214 * buffer header in the hash table, so that reads that hit the
215 * second level ARC benefit from these fast lookups.
216 */
217
/*
 * Bookkeeping for one ARC state.  The two arrays are indexed by buffer
 * contents type (add_reference() indexes them with the header's b_type),
 * so each state keeps a separate evictable list and evictable byte count
 * per type.  Per the locking notes at the top of the file, arcs_mtx
 * protects the buffer list associated with the state.
 */
218 typedef struct arc_state {
219 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
220 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
221 uint64_t arcs_size; /* total amount of data in this state */
222 kmutex_t arcs_mtx;
223 } arc_state_t;
224
225 /* The 6 states: */
/*
 * Backing storage for the states.
 * NOTE(review): presumably the rest of the file reaches these only through
 * the lower-case arc_* pointers; the assignment is not in view -- confirm.
 */
226 static arc_state_t ARC_anon;
227 static arc_state_t ARC_mru;
228 static arc_state_t ARC_mru_ghost;
229 static arc_state_t ARC_mfu;
230 static arc_state_t ARC_mfu_ghost;
231 static arc_state_t ARC_l2c_only;
232
/*
 * The named kstat counters kept for the ARC.
 *
 * The arc_stats variable is initialized positionally, so the order of the
 * members here must stay in step with the order of the entries in that
 * initializer.  Several members double as live ARC variables through the
 * arc_size/arc_p/arc_c/arc_c_min/arc_c_max aliases.
 */
233 typedef struct arc_stats {
234 kstat_named_t arcstat_hits;
235 kstat_named_t arcstat_misses;
236 kstat_named_t arcstat_demand_data_hits;
237 kstat_named_t arcstat_demand_data_misses;
238 kstat_named_t arcstat_demand_metadata_hits;
239 kstat_named_t arcstat_demand_metadata_misses;
240 kstat_named_t arcstat_prefetch_data_hits;
241 kstat_named_t arcstat_prefetch_data_misses;
242 kstat_named_t arcstat_prefetch_metadata_hits;
243 kstat_named_t arcstat_prefetch_metadata_misses;
244 kstat_named_t arcstat_mru_hits;
245 kstat_named_t arcstat_mru_ghost_hits;
246 kstat_named_t arcstat_mfu_hits;
247 kstat_named_t arcstat_mfu_ghost_hits;
248 kstat_named_t arcstat_deleted;
249 kstat_named_t arcstat_recycle_miss;
250 kstat_named_t arcstat_mutex_miss;
251 kstat_named_t arcstat_evict_skip;
252 kstat_named_t arcstat_hash_elements;
253 kstat_named_t arcstat_hash_elements_max;
254 kstat_named_t arcstat_hash_collisions;
255 kstat_named_t arcstat_hash_chains;
256 kstat_named_t arcstat_hash_chain_max;
257 kstat_named_t arcstat_p;
258 kstat_named_t arcstat_c;
259 kstat_named_t arcstat_c_min;
260 kstat_named_t arcstat_c_max;
261 kstat_named_t arcstat_size;
262 kstat_named_t arcstat_hdr_size;
263 kstat_named_t arcstat_data_size;
264 kstat_named_t arcstat_other_size;
265 kstat_named_t arcstat_l2_hits;
266 kstat_named_t arcstat_l2_misses;
267 kstat_named_t arcstat_l2_feeds;
268 kstat_named_t arcstat_l2_rw_clash;
269 kstat_named_t arcstat_l2_read_bytes;
270 kstat_named_t arcstat_l2_write_bytes;
271 kstat_named_t arcstat_l2_writes_sent;
272 kstat_named_t arcstat_l2_writes_done;
273 kstat_named_t arcstat_l2_writes_error;
274 kstat_named_t arcstat_l2_writes_hdr_miss;
275 kstat_named_t arcstat_l2_evict_lock_retry;
276 kstat_named_t arcstat_l2_evict_reading;
277 kstat_named_t arcstat_l2_free_on_write;
278 kstat_named_t arcstat_l2_abort_lowmem;
279 kstat_named_t arcstat_l2_cksum_bad;
280 kstat_named_t arcstat_l2_io_error;
281 kstat_named_t arcstat_l2_size;
282 kstat_named_t arcstat_l2_hdr_size;
283 kstat_named_t arcstat_memory_throttle_count;
284 } arc_stats_t;
285
/*
 * The single instance of the ARC statistics.  Each entry supplies the
 * exported kstat name and its data type for the arc_stats_t member in the
 * same position; the initializer is positional, so a new statistic must be
 * added at the same place in both the structure and this list.
 */
286 static arc_stats_t arc_stats = {
287 { "hits", KSTAT_DATA_UINT64 },
288 { "misses", KSTAT_DATA_UINT64 },
289 { "demand_data_hits", KSTAT_DATA_UINT64 },
290 { "demand_data_misses", KSTAT_DATA_UINT64 },
291 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
292 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
293 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
294 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
295 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
296 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
297 { "mru_hits", KSTAT_DATA_UINT64 },
298 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
299 { "mfu_hits", KSTAT_DATA_UINT64 },
300 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
301 { "deleted", KSTAT_DATA_UINT64 },
302 { "recycle_miss", KSTAT_DATA_UINT64 },
303 { "mutex_miss", KSTAT_DATA_UINT64 },
304 { "evict_skip", KSTAT_DATA_UINT64 },
305 { "hash_elements", KSTAT_DATA_UINT64 },
306 { "hash_elements_max", KSTAT_DATA_UINT64 },
307 { "hash_collisions", KSTAT_DATA_UINT64 },
308 { "hash_chains", KSTAT_DATA_UINT64 },
309 { "hash_chain_max", KSTAT_DATA_UINT64 },
310 { "p", KSTAT_DATA_UINT64 },
311 { "c", KSTAT_DATA_UINT64 },
312 { "c_min", KSTAT_DATA_UINT64 },
313 { "c_max", KSTAT_DATA_UINT64 },
314 { "size", KSTAT_DATA_UINT64 },
315 { "hdr_size", KSTAT_DATA_UINT64 },
316 { "data_size", KSTAT_DATA_UINT64 },
317 { "other_size", KSTAT_DATA_UINT64 },
318 { "l2_hits", KSTAT_DATA_UINT64 },
319 { "l2_misses", KSTAT_DATA_UINT64 },
320 { "l2_feeds", KSTAT_DATA_UINT64 },
321 { "l2_rw_clash", KSTAT_DATA_UINT64 },
322 { "l2_read_bytes", KSTAT_DATA_UINT64 },
323 { "l2_write_bytes", KSTAT_DATA_UINT64 },
324 { "l2_writes_sent", KSTAT_DATA_UINT64 },
325 { "l2_writes_done", KSTAT_DATA_UINT64 },
326 { "l2_writes_error", KSTAT_DATA_UINT64 },
327 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
328 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
329 { "l2_evict_reading", KSTAT_DATA_UINT64 },
330 { "l2_free_on_write", KSTAT_DATA_UINT64 },
331 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
332 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
333 { "l2_io_error", KSTAT_DATA_UINT64 },
334 { "l2_size", KSTAT_DATA_UINT64 },
335 { "l2_hdr_size", KSTAT_DATA_UINT64 },
336 { "memory_throttle_count", KSTAT_DATA_UINT64 }
337 };
338
/*
 * Plain (non-atomic) lvalue access to the 64-bit value of a named ARC
 * statistic, e.g. ARCSTAT(arcstat_size).
 */
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/*
 * Atomically add val (which may be negative) to a statistic.
 *
 * The expansion deliberately has no trailing semicolon: the caller
 * supplies it, so the macro behaves like an ordinary expression statement
 * and can be used as the body of an if that has an else.
 */
#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

/* Atomically increment / decrement a statistic by one. */
#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

/*
 * Raise a statistic to val if val is larger than the current value.
 *
 * The compare-and-swap is retried for as long as val is still larger than
 * the value just observed and another thread changed the statistic between
 * the read and the swap.  Note that val is evaluated on every iteration.
 * Wrapped in do { } while (0) so that it is a single statement; the
 * "continue" applies to the inner while loop.
 */
#define	ARCSTAT_MAX(stat, val) do {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
} while (0)

/* Fold the current value of "stat" into its companion "stat_max". */
#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
356
/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 *
 * The statistic that is bumped is named
 *	arcstat_<stat1|notstat1>_<stat2|notstat2>_<stat>
 * where the first component is chosen by cond1 and the second by cond2.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement; a bare if/else expansion would capture a following
 * "else" at the call site.  Callers terminate the invocation with ';'.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
do {									\
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}								\
} while (0)
376
/*
 * NOTE(review): arc_ksp is presumably the kstat handle through which
 * arc_stats is exported; its creation is not in view -- confirm.
 */
377 kstat_t *arc_ksp;
/*
 * Pointers used to name the six states throughout the file, e.g. by
 * GHOST_STATE() and by the anon-state checks in arc_buf_thaw() and
 * arc_buf_freeze().
 */
378 static arc_state_t *arc_anon;
379 static arc_state_t *arc_mru;
380 static arc_state_t *arc_mru_ghost;
381 static arc_state_t *arc_mfu;
382 static arc_state_t *arc_mfu_ghost;
383 static arc_state_t *arc_l2c_only;
384
385 /*
386 * There are several ARC variables that are critical to export as kstats --
387 * but we don't want to have to grovel around in the kstat whenever we wish to
388 * manipulate them. For these variables, we therefore define them to be in
389 * terms of the statistic variable. This assures that we are not introducing
390 * the possibility of inconsistency by having shadow copies of the variables,
391 * while still allowing the code to be readable.
392 */
/*
 * Each alias expands to ARCSTAT(), i.e. to the ui64 value of the matching
 * kstat, so it is a plain lvalue: reads and writes through these names are
 * not atomic by themselves.
 */
393 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
394 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
395 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
396 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
397 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
398
/*
 * ARC-wide counters that are not exported as kstats.
 * NOTE(review): from the names these look like bytes temporarily reserved,
 * bytes loaned out, and metadata usage / limit / high-water mark; the code
 * maintaining them is not in view -- confirm before relying on this.
 */
399 static int arc_no_grow; /* Don't try to grow cache size */
400 static uint64_t arc_tempreserve;
401 static uint64_t arc_loaned_bytes;
402 static uint64_t arc_meta_used;
403 static uint64_t arc_meta_limit;
404 static uint64_t arc_meta_max = 0;
405
/* Forward declaration; the structure is defined with the L2ARC internals. */
406 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
407
408 typedef struct arc_callback arc_callback_t;
409
/*
 * One entry in a singly linked list of callbacks (linked through
 * acb_next); a header points at such a list through its b_acb member.
 * NOTE(review): presumably one entry per reader waiting on an in-flight
 * read, with acb_done invoked on completion -- the users are not in view.
 */
410 struct arc_callback {
411 void *acb_private;
412 arc_done_func_t *acb_done;
413 arc_buf_t *acb_buf;
414 zio_t *acb_zio_dummy;
415 arc_callback_t *acb_next;
416 };
417
418 typedef struct arc_write_callback arc_write_callback_t;
419
/*
 * Callback bundle for a write: a private argument, a "ready" and a "done"
 * function, and the buffer concerned.
 * NOTE(review): when each function fires is decided by code outside this
 * view -- confirm against the write path.
 */
420 struct arc_write_callback {
421 void *awcb_private;
422 arc_done_func_t *awcb_ready;
423 arc_done_func_t *awcb_done;
424 arc_buf_t *awcb_buf;
425 };
426
/*
 * Header describing one block known to the ARC.
 *
 * Headers are found in the hash table by the identity
 * (b_spa, b_dva, b_birth) -- see BUF_EQUAL() -- and are chained within a
 * hash bucket through b_hash_next.  b_flags holds the ARC_* flags tested
 * by the HDR_*() macros.  b_freeze_cksum is the checksum recorded by
 * arc_cksum_compute() and is read and written under b_freeze_lock.
 * The inline comments group the members by the lock that protects them.
 */
427 struct arc_buf_hdr {
428 /* protected by hash lock */
429 dva_t b_dva;
430 uint64_t b_birth;
431 uint64_t b_cksum0;
432
433 kmutex_t b_freeze_lock;
434 zio_cksum_t *b_freeze_cksum;
435
436 arc_buf_hdr_t *b_hash_next;
437 arc_buf_t *b_buf;
438 uint32_t b_flags;
439 uint32_t b_datacnt;
440
441 arc_callback_t *b_acb;
442 kcondvar_t b_cv;
443
444 /* immutable */
445 arc_buf_contents_t b_type;
446 uint64_t b_size;
447 uint64_t b_spa;
448
449 /* protected by arc state mutex */
450 arc_state_t *b_state;
451 list_node_t b_arc_node;
452
453 /* updated atomically */
454 clock_t b_arc_access;
455
456 /* self protecting */
457 refcount_t b_refcnt;
458
459 l2arc_buf_hdr_t *b_l2hdr;
460 list_node_t b_l2node;
461 };
462
/*
 * NOTE(review): presumably the list of buffers awaiting their eviction
 * callback (see arc_do_user_evicts() in the locking notes), its mutex, and
 * a placeholder header for buffers on that list; the users are not in
 * view -- confirm.
 */
463 static arc_buf_t *arc_eviction_list;
464 static kmutex_t arc_eviction_mtx;
465 static arc_buf_hdr_t arc_eviction_hdr;
/* Forward declarations for functions defined later in the file. */
466 static void arc_get_data_buf(arc_buf_t *buf);
467 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
468 static int arc_evict_needed(arc_buf_contents_t type);
469 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
470
/*
 * True when state is arc_mru_ghost, arc_mfu_ghost or arc_l2c_only.
 * The argument is evaluated up to three times.
 */
471 #define GHOST_STATE(state) \
472 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
473 (state) == arc_l2c_only)
474
475 /*
476 * Private ARC flags. These flags are private ARC only flags that will show up
477 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
478 * be passed in as arc_flags in things like arc_read. However, these flags
479 * should never be passed and should only be set by ARC code. When adding new
480 * public flags, make sure not to smash the private ones.
481 */
482
/* Private flag bits; they occupy bits 9 through 19 of b_flags. */
483 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
484 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
485 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
486 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
487 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
488 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
489 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
490 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
491 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
492 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
493 #define ARC_STORED (1 << 19) /* has been store()d to */
494
/*
 * Flag predicates on a header.  Each one yields the masked flag bits, so
 * the result is non-zero (not necessarily 1) when the flag is set; use
 * them in boolean context only.  ARC_PREFETCH and ARC_L2CACHE are not
 * defined in this part of the file.
 */
495 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
496 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
497 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
498 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
499 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
500 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
501 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
502 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
/*
 * True when I/O is in progress AND the header has an L2ARC header
 * attached ('&' binds tighter than '&&', so this parses as intended).
 */
503 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
504 (hdr)->b_l2hdr != NULL)
505 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
506 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
507 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
508
509 /*
510 * Other sizes
511 */
512
/*
 * Header sizes as signed 64-bit values, so that they can be negated when
 * passed as a delta (e.g. to ARCSTAT_INCR) without unsigned wrap-around.
 */
513 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
514 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
515
516 /*
517 * Hash table routines
518 */
519
/*
 * One hash-chain mutex.  In the kernel build the structure is padded out
 * to HT_LOCK_PAD bytes.
 * NOTE(review): presumably so that each lock sits on its own cache line;
 * the pad size assumes sizeof (kmutex_t) < HT_LOCK_PAD -- confirm.
 */
520 #define HT_LOCK_PAD 64
521
522 struct ht_lock {
523 kmutex_t ht_lock;
524 #ifdef _KERNEL
525 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
526 #endif
527 };
528
/*
 * The ARC hash table.
 *
 * ht_table is an array of ht_mask + 1 chain heads allocated by buf_init();
 * ht_mask is the table size minus one and is ANDed with the hash to pick a
 * bucket.  A fixed array of BUF_LOCKS mutexes is shared by all buckets:
 * BUF_HASH_LOCK() maps a bucket index onto a lock with
 * index & (BUF_LOCKS - 1), which relies on BUF_LOCKS being a power of two.
 */
529 #define BUF_LOCKS 256
530 typedef struct buf_hash_table {
531 uint64_t ht_mask;
532 arc_buf_hdr_t **ht_table;
533 struct ht_lock ht_locks[BUF_LOCKS];
534 } buf_hash_table_t;
535
536 static buf_hash_table_t buf_hash_table;
537
/*
 * Hash table addressing helpers.
 *
 * BUF_HASH_INDEX	bucket index for the identity (spa, dva, birth)
 * BUF_HASH_LOCK_NTRY	the struct ht_lock that covers bucket idx
 * BUF_HASH_LOCK	pointer to the kmutex_t that covers bucket idx
 * HDR_LOCK		pointer to the kmutex_t that covers header buf
 *
 * Every macro argument is parenthesized in the expansion so that an
 * expression argument (e.g. "a | b" or "&hdr") is not re-associated by
 * operator precedence.
 */
#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) \
	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(buf) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX((buf)->b_spa, &(buf)->b_dva, \
	    (buf)->b_birth)))
544
/*
 * CRC-64 lookup table, one entry per byte value.  Filled in by buf_init()
 * and consumed by buf_hash().
 */
545 uint64_t zfs_crc64_table[256];
546
547 /*
548 * Level 2 ARC
549 */
550
/* Compile-time defaults for the L2ARC tunables below. */
551 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
552 #define L2ARC_HEADROOM 2 /* num of writes */
553 #define L2ARC_FEED_SECS 1 /* caching interval secs */
554 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
555
/* The L2ARC write counters live in the kstats; these are plain aliases. */
556 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
557 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
558
559 /*
560 * L2ARC Performance Tunables
561 */
562 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
563 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
564 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
565 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
566 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
567 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
568 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
569 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
570
571 /*
572 * L2ARC Internals
573 */
/*
 * Per-device L2ARC state; each member is described inline.  Devices are
 * linked together through l2ad_node, and l2ad_buflist is a pointer to the
 * list of buffers associated with the device.
 */
574 typedef struct l2arc_dev {
575 vdev_t *l2ad_vdev; /* vdev */
576 spa_t *l2ad_spa; /* spa */
577 uint64_t l2ad_hand; /* next write location */
578 uint64_t l2ad_write; /* desired write size, bytes */
579 uint64_t l2ad_boost; /* warmup write boost, bytes */
580 uint64_t l2ad_start; /* first addr on device */
581 uint64_t l2ad_end; /* last addr on device */
582 uint64_t l2ad_evict; /* last addr eviction reached */
583 boolean_t l2ad_first; /* first sweep through */
584 boolean_t l2ad_writing; /* currently writing */
585 list_t *l2ad_buflist; /* buffer list */
586 list_node_t l2ad_node; /* device list node */
587 } l2arc_dev_t;
588
/*
 * Global L2ARC state.  The upper-case list_t objects are the storage and
 * the lower-case pointers are declared alongside them.
 * NOTE(review): presumably the pointers are set to the address of the
 * matching storage during initialization, which is not in view -- confirm.
 */
589 static list_t L2ARC_dev_list; /* device list */
590 static list_t *l2arc_dev_list; /* device list pointer */
591 static kmutex_t l2arc_dev_mtx; /* device list mutex */
592 static l2arc_dev_t *l2arc_dev_last; /* last device used */
593 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
594 static list_t L2ARC_free_on_write; /* free after write buf list */
595 static list_t *l2arc_free_on_write; /* free after write list ptr */
596 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
597 static uint64_t l2arc_ndev; /* number of devices */
598
/*
 * Context carried by an L2ARC read: the buffer being filled plus the
 * original block pointer, bookmark and flags.
 * NOTE(review): presumably the "original" values allow the read to be
 * reissued to the main pool if the L2ARC read fails -- see
 * l2arc_read_done(), which is not in view.
 */
599 typedef struct l2arc_read_callback {
600 arc_buf_t *l2rcb_buf; /* read buffer */
601 spa_t *l2rcb_spa; /* spa */
602 blkptr_t l2rcb_bp; /* original blkptr */
603 zbookmark_t l2rcb_zb; /* original bookmark */
604 int l2rcb_flags; /* original flags */
605 } l2arc_read_callback_t;
606
/* Context carried by an L2ARC write: the device and the write list head. */
607 typedef struct l2arc_write_callback {
608 l2arc_dev_t *l2wcb_dev; /* device info */
609 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
610 } l2arc_write_callback_t;
611
/*
 * L2ARC part of a header, reached through arc_buf_hdr_t.b_l2hdr: which
 * device holds the copy and the byte offset at which it is stored.
 */
612 struct l2arc_buf_hdr {
613 /* protected by arc_buf_hdr mutex */
614 l2arc_dev_t *b_dev; /* L2ARC device */
615 uint64_t b_daddr; /* disk address, offset byte */
616 };
617
/*
 * A deferred free: a data pointer, its size, and the function to call to
 * release it, linked through l2df_list_node.
 * NOTE(review): presumably these are queued on l2arc_free_on_write and
 * processed once an L2ARC write completes -- the users are not in view.
 */
618 typedef struct l2arc_data_free {
619 /* protected by l2arc_free_on_write_mtx */
620 void *l2df_data;
621 size_t l2df_size;
622 void (*l2df_func)(void *, size_t);
623 list_node_t l2df_list_node;
624 } l2arc_data_free_t;
625
/*
 * NOTE(review): by analogy with the arc_reclaim_thr_* variables these look
 * like the lock, condition variable and exit flag for the L2ARC feed
 * thread; that thread is not in view -- confirm.
 */
626 static kmutex_t l2arc_feed_thr_lock;
627 static kcondvar_t l2arc_feed_thr_cv;
628 static uint8_t l2arc_thread_exit;
629
/* Forward declarations for L2ARC functions defined later in the file. */
630 static void l2arc_read_done(zio_t *zio);
631 static void l2arc_hdr_stat_add(void);
632 static void l2arc_hdr_stat_remove(void);
633
634 static uint64_t
635 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
636 {
637 uint8_t *vdva = (uint8_t *)dva;
638 uint64_t crc = -1ULL;
639 int i;
640
641 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
642
643 for (i = 0; i < sizeof (dva_t); i++)
644 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
645
646 crc ^= (spa>>8) ^ birth;
647
648 return (crc);
649 }
650
/*
 * True when the header carries no identity: both DVA words and the birth
 * value are zero.
 */
#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0 &&			\
	(buf)->b_birth == 0)

/*
 * True when header buf has the identity (spa, dva, birth).
 *
 * The whole expansion and every argument are parenthesized so that the
 * macro can be negated or combined with other operators, and so that an
 * expression passed for spa or birth is compared as a unit.
 */
#define	BUF_EQUAL(spa, dva, birth, buf)				\
	(((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
660
661 static arc_buf_hdr_t *
662 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
663 {
664 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
665 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
666 arc_buf_hdr_t *buf;
667
668 mutex_enter(hash_lock);
669 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
670 buf = buf->b_hash_next) {
671 if (BUF_EQUAL(spa, dva, birth, buf)) {
672 *lockp = hash_lock;
673 return (buf);
674 }
675 }
676 mutex_exit(hash_lock);
677 *lockp = NULL;
678 return (NULL);
679 }
680
681 /*
682 * Insert an entry into the hash table. If there is already an element
683 * equal to elem in the hash table, then the already existing element
684 * will be returned and the new element will not be inserted.
685 * Otherwise returns NULL.
686 */
687 static arc_buf_hdr_t *
688 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
689 {
690 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
691 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
692 arc_buf_hdr_t *fbuf;
693 uint32_t i;
694
695 ASSERT(!HDR_IN_HASH_TABLE(buf));
696 *lockp = hash_lock;
697 mutex_enter(hash_lock);
698 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
699 fbuf = fbuf->b_hash_next, i++) {
700 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
701 return (fbuf);
702 }
703
704 buf->b_hash_next = buf_hash_table.ht_table[idx];
705 buf_hash_table.ht_table[idx] = buf;
706 buf->b_flags |= ARC_IN_HASH_TABLE;
707
708 /* collect some hash table performance data */
709 if (i > 0) {
710 ARCSTAT_BUMP(arcstat_hash_collisions);
711 if (i == 1)
712 ARCSTAT_BUMP(arcstat_hash_chains);
713
714 ARCSTAT_MAX(arcstat_hash_chain_max, i);
715 }
716
717 ARCSTAT_BUMP(arcstat_hash_elements);
718 ARCSTAT_MAXSTAT(arcstat_hash_elements);
719
720 return (NULL);
721 }
722
723 static void
724 buf_hash_remove(arc_buf_hdr_t *buf)
725 {
726 arc_buf_hdr_t *fbuf, **bufp;
727 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
728
729 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
730 ASSERT(HDR_IN_HASH_TABLE(buf));
731
732 bufp = &buf_hash_table.ht_table[idx];
733 while ((fbuf = *bufp) != buf) {
734 ASSERT(fbuf != NULL);
735 bufp = &fbuf->b_hash_next;
736 }
737 *bufp = buf->b_hash_next;
738 buf->b_hash_next = NULL;
739 buf->b_flags &= ~ARC_IN_HASH_TABLE;
740
741 /* collect some hash table performance data */
742 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
743
744 if (buf_hash_table.ht_table[idx] &&
745 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
746 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
747 }
748
749 /*
750 * Global data structures and functions for the buf kmem cache.
751 */
/*
 * hdr_cache holds arc_buf_hdr_t objects and buf_cache holds arc_buf_t
 * objects; both are created in buf_init() and destroyed in buf_fini().
 */
752 static kmem_cache_t *hdr_cache;
753 static kmem_cache_t *buf_cache;
754
755 static void
756 buf_fini(void)
757 {
758 int i;
759
760 kmem_free(buf_hash_table.ht_table,
761 (buf_hash_table.ht_mask + 1) * sizeof (void *));
762 for (i = 0; i < BUF_LOCKS; i++)
763 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
764 kmem_cache_destroy(hdr_cache);
765 kmem_cache_destroy(buf_cache);
766 }
767
768 /*
769 * Constructor callback - called when the cache is empty
770 * and a new buf is requested.
771 */
772 /* ARGSUSED */
773 static int
774 hdr_cons(void *vbuf, void *unused, int kmflag)
775 {
776 arc_buf_hdr_t *buf = vbuf;
777
778 bzero(buf, sizeof (arc_buf_hdr_t));
779 refcount_create(&buf->b_refcnt);
780 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
781 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
782 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
783
784 return (0);
785 }
786
787 /* ARGSUSED */
788 static int
789 buf_cons(void *vbuf, void *unused, int kmflag)
790 {
791 arc_buf_t *buf = vbuf;
792
793 bzero(buf, sizeof (arc_buf_t));
794 rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
795 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
796
797 return (0);
798 }
799
800 /*
801 * Destructor callback - called when a cached buf is
802 * no longer required.
803 */
804 /* ARGSUSED */
805 static void
806 hdr_dest(void *vbuf, void *unused)
807 {
808 arc_buf_hdr_t *buf = vbuf;
809
810 refcount_destroy(&buf->b_refcnt);
811 cv_destroy(&buf->b_cv);
812 mutex_destroy(&buf->b_freeze_lock);
813 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
814 }
815
816 /* ARGSUSED */
817 static void
818 buf_dest(void *vbuf, void *unused)
819 {
820 arc_buf_t *buf = vbuf;
821
822 rw_destroy(&buf->b_lock);
823 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
824 }
825
826 /*
827 * Reclaim callback -- invoked when memory is low.
828 */
829 /* ARGSUSED */
830 static void
831 hdr_recl(void *unused)
832 {
833 dprintf("hdr_recl called\n");
834 /*
835 * umem calls the reclaim func when we destroy the buf cache,
836 * which is after we do arc_fini().
837 */
838 if (!arc_dead)
839 cv_signal(&arc_reclaim_thr_cv);
840 }
841
842 static void
843 buf_init(void)
844 {
845 uint64_t *ct;
846 uint64_t hsize = 1ULL << 12;
847 int i, j;
848
849 /*
850 * The hash table is big enough to fill all of physical memory
851 * with an average 64K block size. The table will take up
852 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
853 */
854 while (hsize * 65536 < physmem * PAGESIZE)
855 hsize <<= 1;
856 retry:
857 buf_hash_table.ht_mask = hsize - 1;
858 buf_hash_table.ht_table =
859 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
860 if (buf_hash_table.ht_table == NULL) {
861 ASSERT(hsize > (1ULL << 8));
862 hsize >>= 1;
863 goto retry;
864 }
865
866 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
867 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
868 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
869 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
870
871 for (i = 0; i < 256; i++)
872 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
873 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
874
875 for (i = 0; i < BUF_LOCKS; i++) {
876 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
877 NULL, MUTEX_DEFAULT, NULL);
878 }
879 }
880
/*
 * One sixteenth of a second expressed in clock ticks (hz >> 4).
 * NOTE(review): the consumer is not in view; presumably the minimum time
 * between accesses before a buffer is promoted -- confirm.
 */
881 #define ARC_MINTIME (hz>>4) /* 62 ms */
882
883 static void
884 arc_cksum_verify(arc_buf_t *buf)
885 {
886 zio_cksum_t zc;
887
888 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
889 return;
890
891 mutex_enter(&buf->b_hdr->b_freeze_lock);
892 if (buf->b_hdr->b_freeze_cksum == NULL ||
893 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
894 mutex_exit(&buf->b_hdr->b_freeze_lock);
895 return;
896 }
897 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
898 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
899 panic("buffer modified while frozen!");
900 mutex_exit(&buf->b_hdr->b_freeze_lock);
901 }
902
903 static int
904 arc_cksum_equal(arc_buf_t *buf)
905 {
906 zio_cksum_t zc;
907 int equal;
908
909 mutex_enter(&buf->b_hdr->b_freeze_lock);
910 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
911 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
912 mutex_exit(&buf->b_hdr->b_freeze_lock);
913
914 return (equal);
915 }
916
917 static void
918 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
919 {
920 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
921 return;
922
923 mutex_enter(&buf->b_hdr->b_freeze_lock);
924 if (buf->b_hdr->b_freeze_cksum != NULL) {
925 mutex_exit(&buf->b_hdr->b_freeze_lock);
926 return;
927 }
928 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
929 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
930 buf->b_hdr->b_freeze_cksum);
931 mutex_exit(&buf->b_hdr->b_freeze_lock);
932 }
933
934 void
935 arc_buf_thaw(arc_buf_t *buf)
936 {
937 if (zfs_flags & ZFS_DEBUG_MODIFY) {
938 if (buf->b_hdr->b_state != arc_anon)
939 panic("modifying non-anon buffer!");
940 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
941 panic("modifying buffer while i/o in progress!");
942 arc_cksum_verify(buf);
943 }
944
945 mutex_enter(&buf->b_hdr->b_freeze_lock);
946 if (buf->b_hdr->b_freeze_cksum != NULL) {
947 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
948 buf->b_hdr->b_freeze_cksum = NULL;
949 }
950 mutex_exit(&buf->b_hdr->b_freeze_lock);
951 }
952
953 void
954 arc_buf_freeze(arc_buf_t *buf)
955 {
956 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
957 return;
958
959 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
960 buf->b_hdr->b_state == arc_anon);
961 arc_cksum_compute(buf, B_FALSE);
962 }
963
964 static void
965 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
966 {
967 ASSERT(MUTEX_HELD(hash_lock));
968
969 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
970 (ab->b_state != arc_anon)) {
971 uint64_t delta = ab->b_size * ab->b_datacnt;
972 list_t *list = &ab->b_state->arcs_list[ab->b_type];
973 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
974
975 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
976 mutex_enter(&ab->b_state->arcs_mtx);
977 ASSERT(list_link_active(&ab->b_arc_node));
978 list_remove(list, ab);
979 if (GHOST_STATE(ab->b_state)) {
980 ASSERT3U(ab->b_datacnt, ==, 0);
981 ASSERT3P(ab->b_buf, ==, NULL);
982 delta = ab->b_size;
983 }
984 ASSERT(delta > 0);
985 ASSERT3U(*size, >=, delta);
986 atomic_add_64(size, -delta);
987 mutex_exit(&ab->b_state->arcs_mtx);
988 /* remove the prefetch flag if we get a reference */
989 if (ab->b_flags & ARC_PREFETCH)
990 ab->b_flags &= ~ARC_PREFETCH;
991 }
992 }
993
994 static int
995 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
996 {
997 int cnt;
998 arc_state_t *state = ab->b_state;
999
1000 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1001 ASSERT(!GHOST_STATE(state));
1002
1003 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1004 (state != arc_anon)) {
1005 uint64_t *size = &state->arcs_lsize[ab->b_type];
1006
1007 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1008 mutex_enter(&state->arcs_mtx);
1009 ASSERT(!list_link_active(&ab->b_arc_node));
1010 list_insert_head(&state->arcs_list[ab->b_type], ab);
1011 ASSERT(ab->b_datacnt > 0);
1012 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1013 mutex_exit(&state->arcs_mtx);
1014 }
1015 return (cnt);
1016 }
1017
/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 *
 * new_state:	state the header is moving to (must differ from the
 *		current one)
 * ab:		header being moved
 * hash_lock:	hash-chain mutex protecting 'ab'; asserted to be held
 *
 * Evictable (unreferenced) headers are also moved between the per-state
 * evictable lists and the list sizes are adjusted.  The per-state total
 * sizes are adjusted whether or not the header is referenced.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int64_t refcnt = refcount_count(&ab->b_refcnt);
	uint64_t from_delta, to_delta;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));

	/* default charge on both sides: the bytes of data actually held */
	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon) {
			/* only take the list mutex if we don't hold it yet */
			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
			uint64_t *size = &old_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&old_state->arcs_mtx);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&old_state->arcs_list[ab->b_type], ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-null datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(*size, >=, from_delta);
			atomic_add_64(size, -from_delta);

			if (use_mutex)
				mutex_exit(&old_state->arcs_mtx);
		}
		if (new_state != arc_anon) {
			/* only take the list mutex if we don't hold it yet */
			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
			uint64_t *size = &new_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&new_state->arcs_mtx);

			list_insert_head(&new_state->arcs_list[ab->b_type], ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(size, to_delta);

			if (use_mutex)
				mutex_exit(&new_state->arcs_mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	/* a header moved to the anonymous state leaves the hash table */
	if (new_state == arc_anon) {
		buf_hash_remove(ab);
	}

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->arcs_size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->arcs_size, >=, from_delta);
		atomic_add_64(&old_state->arcs_size, -from_delta);
	}
	ab->b_state = new_state;

	/* adjust l2arc hdr stats */
	if (new_state == arc_l2c_only)
		l2arc_hdr_stat_add();
	else if (old_state == arc_l2c_only)
		l2arc_hdr_stat_remove();
}
1108
1109 void
1110 arc_space_consume(uint64_t space, arc_space_type_t type)
1111 {
1112 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1113
1114 switch (type) {
1115 case ARC_SPACE_DATA:
1116 ARCSTAT_INCR(arcstat_data_size, space);
1117 break;
1118 case ARC_SPACE_OTHER:
1119 ARCSTAT_INCR(arcstat_other_size, space);
1120 break;
1121 case ARC_SPACE_HDRS:
1122 ARCSTAT_INCR(arcstat_hdr_size, space);
1123 break;
1124 case ARC_SPACE_L2HDRS:
1125 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1126 break;
1127 }
1128
1129 atomic_add_64(&arc_meta_used, space);
1130 atomic_add_64(&arc_size, space);
1131 }
1132
1133 void
1134 arc_space_return(uint64_t space, arc_space_type_t type)
1135 {
1136 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1137
1138 switch (type) {
1139 case ARC_SPACE_DATA:
1140 ARCSTAT_INCR(arcstat_data_size, -space);
1141 break;
1142 case ARC_SPACE_OTHER:
1143 ARCSTAT_INCR(arcstat_other_size, -space);
1144 break;
1145 case ARC_SPACE_HDRS:
1146 ARCSTAT_INCR(arcstat_hdr_size, -space);
1147 break;
1148 case ARC_SPACE_L2HDRS:
1149 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1150 break;
1151 }
1152
1153 ASSERT(arc_meta_used >= space);
1154 if (arc_meta_max < arc_meta_used)
1155 arc_meta_max = arc_meta_used;
1156 atomic_add_64(&arc_meta_used, -space);
1157 ASSERT(arc_size >= space);
1158 atomic_add_64(&arc_size, -space);
1159 }
1160
1161 void *
1162 arc_data_buf_alloc(uint64_t size)
1163 {
1164 if (arc_evict_needed(ARC_BUFC_DATA))
1165 cv_signal(&arc_reclaim_thr_cv);
1166 atomic_add_64(&arc_size, size);
1167 return (zio_data_buf_alloc(size));
1168 }
1169
1170 void
1171 arc_data_buf_free(void *buf, uint64_t size)
1172 {
1173 zio_data_buf_free(buf, size);
1174 ASSERT(arc_size >= size);
1175 atomic_add_64(&arc_size, -size);
1176 }
1177
1178 arc_buf_t *
1179 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1180 {
1181 arc_buf_hdr_t *hdr;
1182 arc_buf_t *buf;
1183
1184 ASSERT3U(size, >, 0);
1185 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1186 ASSERT(BUF_EMPTY(hdr));
1187 hdr->b_size = size;
1188 hdr->b_type = type;
1189 hdr->b_spa = spa_guid(spa);
1190 hdr->b_state = arc_anon;
1191 hdr->b_arc_access = 0;
1192 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1193 buf->b_hdr = hdr;
1194 buf->b_data = NULL;
1195 buf->b_efunc = NULL;
1196 buf->b_private = NULL;
1197 buf->b_next = NULL;
1198 hdr->b_buf = buf;
1199 arc_get_data_buf(buf);
1200 hdr->b_datacnt = 1;
1201 hdr->b_flags = 0;
1202 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1203 (void) refcount_add(&hdr->b_refcnt, tag);
1204
1205 return (buf);
1206 }
1207
1208 static char *arc_onloan_tag = "onloan";
1209
1210 /*
1211 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1212 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1213 * buffers must be returned to the arc before they can be used by the DMU or
1214 * freed.
1215 */
1216 arc_buf_t *
1217 arc_loan_buf(spa_t *spa, int size)
1218 {
1219 arc_buf_t *buf;
1220
1221 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1222
1223 atomic_add_64(&arc_loaned_bytes, size);
1224 return (buf);
1225 }
1226
1227 /*
1228 * Return a loaned arc buffer to the arc.
1229 */
1230 void
1231 arc_return_buf(arc_buf_t *buf, void *tag)
1232 {
1233 arc_buf_hdr_t *hdr = buf->b_hdr;
1234
1235 ASSERT(hdr->b_state == arc_anon);
1236 ASSERT(buf->b_data != NULL);
1237 VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
1238 VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
1239
1240 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1241 }
1242
1243 static arc_buf_t *
1244 arc_buf_clone(arc_buf_t *from)
1245 {
1246 arc_buf_t *buf;
1247 arc_buf_hdr_t *hdr = from->b_hdr;
1248 uint64_t size = hdr->b_size;
1249
1250 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1251 buf->b_hdr = hdr;
1252 buf->b_data = NULL;
1253 buf->b_efunc = NULL;
1254 buf->b_private = NULL;
1255 buf->b_next = hdr->b_buf;
1256 hdr->b_buf = buf;
1257 arc_get_data_buf(buf);
1258 bcopy(from->b_data, buf->b_data, size);
1259 hdr->b_datacnt += 1;
1260 return (buf);
1261 }
1262
1263 void
1264 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1265 {
1266 arc_buf_hdr_t *hdr;
1267 kmutex_t *hash_lock;
1268
1269 /*
1270 * Check to see if this buffer is evicted. Callers
1271 * must verify b_data != NULL to know if the add_ref
1272 * was successful.
1273 */
1274 rw_enter(&buf->b_lock, RW_READER);
1275 if (buf->b_data == NULL) {
1276 rw_exit(&buf->b_lock);
1277 return;
1278 }
1279 hdr = buf->b_hdr;
1280 ASSERT(hdr != NULL);
1281 hash_lock = HDR_LOCK(hdr);
1282 mutex_enter(hash_lock);
1283 rw_exit(&buf->b_lock);
1284
1285 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1286 add_reference(hdr, hash_lock, tag);
1287 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1288 arc_access(hdr, hash_lock);
1289 mutex_exit(hash_lock);
1290 ARCSTAT_BUMP(arcstat_hits);
1291 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1292 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1293 data, metadata, hits);
1294 }
1295
1296 /*
1297 * Free the arc data buffer. If it is an l2arc write in progress,
1298 * the buffer is placed on l2arc_free_on_write to be freed later.
1299 */
1300 static void
1301 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1302 void *data, size_t size)
1303 {
1304 if (HDR_L2_WRITING(hdr)) {
1305 l2arc_data_free_t *df;
1306 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1307 df->l2df_data = data;
1308 df->l2df_size = size;
1309 df->l2df_func = free_func;
1310 mutex_enter(&l2arc_free_on_write_mtx);
1311 list_insert_head(l2arc_free_on_write, df);
1312 mutex_exit(&l2arc_free_on_write_mtx);
1313 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1314 } else {
1315 free_func(data, size);
1316 }
1317 }
1318
/*
 * Release the data held by 'buf' and, optionally, the buf itself.
 *
 * recycle:	when set, the data block is NOT freed and the ARC size
 *		counters are not reduced for it; state sizes and the
 *		header's data count are still adjusted.
 * all:		when set, the buf is also unlinked from its header's buffer
 *		list and returned to buf_cache; otherwise the function
 *		returns once the data has been dealt with.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);
		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				arc_buf_data_free(buf->b_hdr, zio_buf_free,
				    buf->b_data, size);
				arc_space_return(size, ARC_SPACE_DATA);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				arc_buf_data_free(buf->b_hdr,
				    zio_data_buf_free, buf->b_data, size);
				ARCSTAT_INCR(arcstat_data_size, -size);
				atomic_add_64(&arc_size, -size);
			}
		}
		/* header on an evictable list: shrink that list's size too */
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}
1375
1376 static void
1377 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1378 {
1379 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1380 ASSERT3P(hdr->b_state, ==, arc_anon);
1381 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1382 ASSERT(!(hdr->b_flags & ARC_STORED));
1383
1384 if (hdr->b_l2hdr != NULL) {
1385 if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
1386 /*
1387 * To prevent arc_free() and l2arc_evict() from
1388 * attempting to free the same buffer at the same time,
1389 * a FREE_IN_PROGRESS flag is given to arc_free() to
1390 * give it priority. l2arc_evict() can't destroy this
1391 * header while we are waiting on l2arc_buflist_mtx.
1392 *
1393 * The hdr may be removed from l2ad_buflist before we
1394 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1395 */
1396 mutex_enter(&l2arc_buflist_mtx);
1397 if (hdr->b_l2hdr != NULL) {
1398 list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
1399 hdr);
1400 }
1401 mutex_exit(&l2arc_buflist_mtx);
1402 } else {
1403 list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
1404 }
1405 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1406 kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
1407 if (hdr->b_state == arc_l2c_only)
1408 l2arc_hdr_stat_remove();
1409 hdr->b_l2hdr = NULL;
1410 }
1411
1412 if (!BUF_EMPTY(hdr)) {
1413 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1414 bzero(&hdr->b_dva, sizeof (dva_t));
1415 hdr->b_birth = 0;
1416 hdr->b_cksum0 = 0;
1417 }
1418 while (hdr->b_buf) {
1419 arc_buf_t *buf = hdr->b_buf;
1420
1421 if (buf->b_efunc) {
1422 mutex_enter(&arc_eviction_mtx);
1423 rw_enter(&buf->b_lock, RW_WRITER);
1424 ASSERT(buf->b_hdr != NULL);
1425 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1426 hdr->b_buf = buf->b_next;
1427 buf->b_hdr = &arc_eviction_hdr;
1428 buf->b_next = arc_eviction_list;
1429 arc_eviction_list = buf;
1430 rw_exit(&buf->b_lock);
1431 mutex_exit(&arc_eviction_mtx);
1432 } else {
1433 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1434 }
1435 }
1436 if (hdr->b_freeze_cksum != NULL) {
1437 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1438 hdr->b_freeze_cksum = NULL;
1439 }
1440
1441 ASSERT(!list_link_active(&hdr->b_arc_node));
1442 ASSERT3P(hdr->b_hash_next, ==, NULL);
1443 ASSERT3P(hdr->b_acb, ==, NULL);
1444 kmem_cache_free(hdr_cache, hdr);
1445 }
1446
/*
 * Drop the reference held by 'tag' on 'buf' and free what can be freed.
 * The buf must have data and no eviction callback.  Three cases:
 *  - header is not anonymous: under the hash lock, destroy the buf if
 *    the header holds more than one data copy, otherwise flag the header
 *    ARC_BUF_AVAILABLE;
 *  - anonymous with I/O in progress: destroy the header only if the I/O
 *    has finished by the time the reference is gone;
 *  - anonymous, no I/O: destroy just this buf if references remain,
 *    otherwise destroy the whole header.
 */
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		/* re-sample the I/O flag while holding arc_eviction_mtx */
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0) {
			/* other references remain; free only this buf */
			ASSERT(HDR_IO_ERROR(hdr));
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			arc_hdr_destroy(hdr);
		}
	}
}
1489
1490 int
1491 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1492 {
1493 arc_buf_hdr_t *hdr = buf->b_hdr;
1494 kmutex_t *hash_lock = HDR_LOCK(hdr);
1495 int no_callback = (buf->b_efunc == NULL);
1496
1497 if (hdr->b_state == arc_anon) {
1498 arc_buf_free(buf, tag);
1499 return (no_callback);
1500 }
1501
1502 mutex_enter(hash_lock);
1503 ASSERT(hdr->b_state != arc_anon);
1504 ASSERT(buf->b_data != NULL);
1505
1506 (void) remove_reference(hdr, hash_lock, tag);
1507 if (hdr->b_datacnt > 1) {
1508 if (no_callback)
1509 arc_buf_destroy(buf, FALSE, TRUE);
1510 } else if (no_callback) {
1511 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1512 hdr->b_flags |= ARC_BUF_AVAILABLE;
1513 }
1514 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1515 refcount_is_zero(&hdr->b_refcnt));
1516 mutex_exit(hash_lock);
1517 return (no_callback);
1518 }
1519
1520 int
1521 arc_buf_size(arc_buf_t *buf)
1522 {
1523 return (buf->b_hdr->b_size);
1524 }
1525
1526 /*
1527 * Evict buffers from list until we've removed the specified number of
1528 * bytes. Move the removed buffers to the appropriate evict state.
1529 * If the recycle flag is set, then attempt to "recycle" a buffer:
1530 * - look for a buffer to evict that is `bytes' long.
1531 * - return the data block from this buffer rather than freeing it.
1532 * This flag is used by callers that are trying to make space for a
1533 * new buffer in a full arc cache.
1534 *
1535 * This function makes a "best effort". It skips over any buffers
1536 * it can't get a hash_lock on, and so may not catch all candidates.
1537 * It may also return without evicting as much space as requested.
1538 */
1539 static void *
1540 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1541 arc_buf_contents_t type)
1542 {
1543 arc_state_t *evicted_state;
1544 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1545 arc_buf_hdr_t *ab, *ab_prev = NULL;
1546 list_t *list = &state->arcs_list[type];
1547 kmutex_t *hash_lock;
1548 boolean_t have_lock;
1549 void *stolen = NULL;
1550
1551 ASSERT(state == arc_mru || state == arc_mfu);
1552
1553 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1554
1555 mutex_enter(&state->arcs_mtx);
1556 mutex_enter(&evicted_state->arcs_mtx);
1557
1558 for (ab = list_tail(list); ab; ab = ab_prev) {
1559 ab_prev = list_prev(list, ab);
1560 /* prefetch buffers have a minimum lifespan */
1561 if (HDR_IO_IN_PROGRESS(ab) ||
1562 (spa && ab->b_spa != spa) ||
1563 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1564 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
1565 skipped++;
1566 continue;
1567 }
1568 /* "lookahead" for better eviction candidate */
1569 if (recycle && ab->b_size != bytes &&
1570 ab_prev && ab_prev->b_size == bytes)
1571 continue;
1572 hash_lock = HDR_LOCK(ab);
1573 have_lock = MUTEX_HELD(hash_lock);
1574 if (have_lock || mutex_tryenter(hash_lock)) {
1575 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1576 ASSERT(ab->b_datacnt > 0);
1577 while (ab->b_buf) {
1578 arc_buf_t *buf = ab->b_buf;
1579 if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
1580 missed += 1;
1581 break;
1582 }
1583 if (buf->b_data) {
1584 bytes_evicted += ab->b_size;
1585 if (recycle && ab->b_type == type &&
1586 ab->b_size == bytes &&
1587 !HDR_L2_WRITING(ab)) {
1588 stolen = buf->b_data;
1589 recycle = FALSE;
1590 }
1591 }
1592 if (buf->b_efunc) {
1593 mutex_enter(&arc_eviction_mtx);
1594 arc_buf_destroy(buf,
1595 buf->b_data == stolen, FALSE);
1596 ab->b_buf = buf->b_next;
1597 buf->b_hdr = &arc_eviction_hdr;
1598 buf->b_next = arc_eviction_list;
1599 arc_eviction_list = buf;
1600 mutex_exit(&arc_eviction_mtx);
1601 rw_exit(&buf->b_lock);
1602 } else {
1603 rw_exit(&buf->b_lock);
1604 arc_buf_destroy(buf,
1605 buf->b_data == stolen, TRUE);
1606 }
1607 }
1608 if (ab->b_datacnt == 0) {
1609 arc_change_state(evicted_state, ab, hash_lock);
1610 ASSERT(HDR_IN_HASH_TABLE(ab));
1611 ab->b_flags |= ARC_IN_HASH_TABLE;
1612 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1613 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1614 }
1615 if (!have_lock)
1616 mutex_exit(hash_lock);
1617 if (bytes >= 0 && bytes_evicted >= bytes)
1618 break;
1619 } else {
1620 missed += 1;
1621 }
1622 }
1623
1624 mutex_exit(&evicted_state->arcs_mtx);
1625 mutex_exit(&state->arcs_mtx);
1626
1627 if (bytes_evicted < bytes)
1628 dprintf("only evicted %lld bytes from %x",
1629 (longlong_t)bytes_evicted, state);
1630
1631 if (skipped)
1632 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1633
1634 if (missed)
1635 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1636
1637 /*
1638 * We have just evicted some date into the ghost state, make
1639 * sure we also adjust the ghost state size if necessary.
1640 */
1641 if (arc_no_grow &&
1642 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1643 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1644 arc_mru_ghost->arcs_size - arc_c;
1645
1646 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1647 int64_t todelete =
1648 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1649 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1650 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1651 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1652 arc_mru_ghost->arcs_size +
1653 arc_mfu_ghost->arcs_size - arc_c);
1654 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1655 }
1656 }
1657
1658 return (stolen);
1659 }
1660
/*
 * Remove buffers from list until we've removed the specified number of
 * bytes.  Destroy the buffers that are removed.
 *
 * state:	a ghost state (asserted)
 * spa:		if non-zero, only headers whose b_spa matches are removed
 * bytes:	amount to remove; a negative value means "everything", in
 *		which case a busy hash lock is waited for rather than
 *		skipped
 *
 * The data list is processed first, then the metadata list if the target
 * has not been reached (or bytes is negative).
 */
static void
arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
{
	arc_buf_hdr_t *ab, *ab_prev;
	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint64_t bufs_skipped = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->arcs_mtx);
	for (ab = list_tail(list); ab; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		if (spa && ab->b_spa != spa)
			continue;
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			ASSERT(!HDR_IO_IN_PROGRESS(ab));
			ASSERT(ab->b_buf == NULL);
			ARCSTAT_BUMP(arcstat_deleted);
			bytes_deleted += ab->b_size;

			if (ab->b_l2hdr != NULL) {
				/*
				 * This buffer is cached on the 2nd Level ARC;
				 * don't destroy the header.
				 */
				arc_change_state(arc_l2c_only, ab, hash_lock);
				mutex_exit(hash_lock);
			} else {
				arc_change_state(arc_anon, ab, hash_lock);
				mutex_exit(hash_lock);
				arc_hdr_destroy(ab);
			}

			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else {
			if (bytes < 0) {
				/*
				 * Evicting everything: drop the list mutex,
				 * wait for the current holder of the hash
				 * lock to release it, then rescan the list
				 * from the tail.
				 */
				mutex_exit(&state->arcs_mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			bufs_skipped += 1;
		}
	}
	mutex_exit(&state->arcs_mtx);

	/* data list done; go around again for the metadata list if needed */
	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
	    (bytes < 0 || bytes_deleted < bytes)) {
		list = &state->arcs_list[ARC_BUFC_METADATA];
		goto top;
	}

	if (bufs_skipped) {
		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}
1731
1732 static void
1733 arc_adjust(void)
1734 {
1735 int64_t adjustment, delta;
1736
1737 /*
1738 * Adjust MRU size
1739 */
1740
1741 adjustment = MIN(arc_size - arc_c,
1742 arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
1743
1744 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1745 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1746 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
1747 adjustment -= delta;
1748 }
1749
1750 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1751 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1752 (void) arc_evict(arc_mru, NULL, delta, FALSE,
1753 ARC_BUFC_METADATA);
1754 }
1755
1756 /*
1757 * Adjust MFU size
1758 */
1759
1760 adjustment = arc_size - arc_c;
1761
1762 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1763 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1764 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
1765 adjustment -= delta;
1766 }
1767
1768 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1769 int64_t delta = MIN(adjustment,
1770 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
1771 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
1772 ARC_BUFC_METADATA);
1773 }
1774
1775 /*
1776 * Adjust ghost lists
1777 */
1778
1779 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
1780
1781 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
1782 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
1783 arc_evict_ghost(arc_mru_ghost, NULL, delta);
1784 }
1785
1786 adjustment =
1787 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
1788
1789 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
1790 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
1791 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
1792 }
1793 }
1794
1795 static void
1796 arc_do_user_evicts(void)
1797 {
1798 mutex_enter(&arc_eviction_mtx);
1799 while (arc_eviction_list != NULL) {
1800 arc_buf_t *buf = arc_eviction_list;
1801 arc_eviction_list = buf->b_next;
1802 rw_enter(&buf->b_lock, RW_WRITER);
1803 buf->b_hdr = NULL;
1804 rw_exit(&buf->b_lock);
1805 mutex_exit(&arc_eviction_mtx);
1806
1807 if (buf->b_efunc != NULL)
1808 VERIFY(buf->b_efunc(buf) == 0);
1809
1810 buf->b_efunc = NULL;
1811 buf->b_private = NULL;
1812 kmem_cache_free(buf_cache, buf);
1813 mutex_enter(&arc_eviction_mtx);
1814 }
1815 mutex_exit(&arc_eviction_mtx);
1816 }
1817
1818 /*
1819 * Flush all *evictable* data from the cache for the given spa.
1820 * NOTE: this will not touch "active" (i.e. referenced) data.
1821 */
1822 void
1823 arc_flush(spa_t *spa)
1824 {
1825 uint64_t guid = 0;
1826
1827 if (spa)
1828 guid = spa_guid(spa);
1829
1830 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
1831 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
1832 if (spa)
1833 break;
1834 }
1835 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
1836 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
1837 if (spa)
1838 break;
1839 }
1840 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
1841 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
1842 if (spa)
1843 break;
1844 }
1845 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
1846 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
1847 if (spa)
1848 break;
1849 }
1850
1851 arc_evict_ghost(arc_mru_ghost, guid, -1);
1852 arc_evict_ghost(arc_mfu_ghost, guid, -1);
1853
1854 mutex_enter(&arc_reclaim_thr_lock);
1855 arc_do_user_evicts();
1856 mutex_exit(&arc_reclaim_thr_lock);
1857 ASSERT(spa || arc_eviction_list == NULL);
1858 }
1859
1860 void
1861 arc_shrink(void)
1862 {
1863 if (arc_c > arc_c_min) {
1864 uint64_t to_free;
1865
1866 #ifdef _KERNEL
1867 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
1868 #else
1869 to_free = arc_c >> arc_shrink_shift;
1870 #endif
1871 if (arc_c > arc_c_min + to_free)
1872 atomic_add_64(&arc_c, -to_free);
1873 else
1874 arc_c = arc_c_min;
1875
1876 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
1877 if (arc_c > arc_size)
1878 arc_c = MAX(arc_size, arc_c_min);
1879 if (arc_p > arc_c)
1880 arc_p = (arc_c >> 1);
1881 ASSERT(arc_c >= arc_c_min);
1882 ASSERT((int64_t)arc_p >= 0);
1883 }
1884
1885 if (arc_size > arc_c)
1886 arc_adjust();
1887 }
1888
/*
 * Decide whether the ARC should give memory back to the system.
 * Returns 1 if reclaim is needed, 0 otherwise.
 *
 * In the kernel the decision is based on the VM's free-memory tunables
 * (and, on i386, on kernel heap usage).  Outside the kernel it returns 1
 * whenever spa_get_random(100) yields 0.
 */
static int
arc_reclaim_needed(void)
{
	uint64_t extra;

#ifdef _KERNEL

	/* the VM is explicitly asking for pages */
	if (needfree)
		return (1);

	/*
	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
	 */
	extra = desfree;

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	if (freemem < lotsfree + needfree + extra)
		return (1);

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed. anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
		return (1);

#if defined(__i386)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
		return (1);
#endif

#else
	/* userland build: ask for reclaim at random */
	if (spa_get_random(100) == 0)
		return (1);
#endif
	return (0);
}
1947
1948 static void
1949 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1950 {
1951 size_t i;
1952 kmem_cache_t *prev_cache = NULL;
1953 kmem_cache_t *prev_data_cache = NULL;
1954 extern kmem_cache_t *zio_buf_cache[];
1955 extern kmem_cache_t *zio_data_buf_cache[];
1956
1957 #ifdef _KERNEL
1958 if (arc_meta_used >= arc_meta_limit) {
1959 /*
1960 * We are exceeding our meta-data cache limit.
1961 * Purge some DNLC entries to release holds on meta-data.
1962 */
1963 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
1964 }
1965 #if defined(__i386)
1966 /*
1967 * Reclaim unused memory from all kmem caches.
1968 */
1969 kmem_reap();
1970 #endif
1971 #endif
1972
1973 /*
1974 * An aggressive reclamation will shrink the cache size as well as
1975 * reap free buffers from the arc kmem caches.
1976 */
1977 if (strat == ARC_RECLAIM_AGGR)
1978 arc_shrink();
1979
1980 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1981 if (zio_buf_cache[i] != prev_cache) {
1982 prev_cache = zio_buf_cache[i];
1983 kmem_cache_reap_now(zio_buf_cache[i]);
1984 }
1985 if (zio_data_buf_cache[i] != prev_data_cache) {
1986 prev_data_cache = zio_data_buf_cache[i];
1987 kmem_cache_reap_now(zio_data_buf_cache[i]);
1988 }
1989 }
1990 kmem_cache_reap_now(buf_cache);
1991 kmem_cache_reap_now(hdr_cache);
1992 }
1993
/*
 * Body of the ARC reclaim thread.  Until arc_thread_exit is set it loops:
 * reap memory when arc_reclaim_needed() says so, re-enable cache growth
 * once the grow-retry delay has passed, run arc_adjust() when the cache
 * plus ghost lists exceed twice the target size, run any pending user
 * eviction callbacks, and then sleep for up to one second or until
 * signalled on arc_reclaim_thr_cv.
 */
static void
arc_reclaim_thread(void)
{
	clock_t growtime = 0;
	arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_thr_lock);
	while (arc_thread_exit == 0) {
		if (arc_reclaim_needed()) {

			if (arc_no_grow) {
				/*
				 * Growth is already disabled: alternate
				 * between the two reclaim strategies.
				 */
				if (last_reclaim == ARC_RECLAIM_CONS) {
					last_reclaim = ARC_RECLAIM_AGGR;
				} else {
					last_reclaim = ARC_RECLAIM_CONS;
				}
			} else {
				/* first reclaim: stop growing, go aggressive */
				arc_no_grow = TRUE;
				last_reclaim = ARC_RECLAIM_AGGR;
				membar_producer();
			}

			/* reset the growth delay for every reclaim */
			growtime = lbolt + (arc_grow_retry * hz);

			arc_kmem_reap_now(last_reclaim);
			arc_warm = B_TRUE;

		} else if (arc_no_grow && lbolt >= growtime) {
			/* no pressure and the delay has expired: grow again */
			arc_no_grow = FALSE;
		}

		if (2 * arc_c < arc_size +
		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
			arc_adjust();

		if (arc_eviction_list != NULL)
			arc_do_user_evicts();

		/* block until needed, or one second, whichever is shorter */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_reclaim_thr_cv,
		    &arc_reclaim_thr_lock, (lbolt + hz));
		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
	}

	/* clear the exit request and wake any waiters on the cv */
	arc_thread_exit = 0;
	cv_broadcast(&arc_reclaim_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
	thread_exit();
}
2048
2049 /*
2050 * Adapt arc info given the number of bytes we are trying to add and
2051 * the state that we are comming from. This function is only called
2052 * when we are adding new content to the cache.
2053 */
2054 static void
2055 arc_adapt(int bytes, arc_state_t *state)
2056 {
2057 int mult;
2058 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2059
2060 if (state == arc_l2c_only)
2061 return;
2062
2063 ASSERT(bytes > 0);
2064 /*
2065 * Adapt the target size of the MRU list:
2066 * - if we just hit in the MRU ghost list, then increase
2067 * the target size of the MRU list.
2068 * - if we just hit in the MFU ghost list, then increase
2069 * the target size of the MFU list by decreasing the
2070 * target size of the MRU list.
2071 */
2072 if (state == arc_mru_ghost) {
2073 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2074 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2075
2076 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2077 } else if (state == arc_mfu_ghost) {
2078 uint64_t delta;
2079
2080 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2081 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2082
2083 delta = MIN(bytes * mult, arc_p);
2084 arc_p = MAX(arc_p_min, arc_p - delta);
2085 }
2086 ASSERT((int64_t)arc_p >= 0);
2087
2088 if (arc_reclaim_needed()) {
2089 cv_signal(&arc_reclaim_thr_cv);
2090 return;
2091 }
2092
2093 if (arc_no_grow)
2094 return;
2095
2096 if (arc_c >= arc_c_max)
2097 return;
2098
2099 /*
2100 * If we're within (2 * maxblocksize) bytes of the target
2101 * cache size, increment the target cache size
2102 */
2103 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2104 atomic_add_64(&arc_c, (int64_t)bytes);
2105 if (arc_c > arc_c_max)
2106 arc_c = arc_c_max;
2107 else if (state == arc_anon)
2108 atomic_add_64(&arc_p, (int64_t)bytes);
2109 if (arc_p > arc_c)
2110 arc_p = arc_c;
2111 }
2112 ASSERT((int64_t)arc_p >= 0);
2113 }
2114
2115 /*
2116 * Check if the cache has reached its limits and eviction is required
2117 * prior to insert.
2118 */
2119 static int
2120 arc_evict_needed(arc_buf_contents_t type)
2121 {
2122 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2123 return (1);
2124
2125 #ifdef _KERNEL
2126 /*
2127 * If zio data pages are being allocated out of a separate heap segment,
2128 * then enforce that the size of available vmem for this area remains
2129 * above about 1/32nd free.
2130 */
2131 if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2132 vmem_size(zio_arena, VMEM_FREE) <
2133 (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2134 return (1);
2135 #endif
2136
2137 if (arc_reclaim_needed())
2138 return (1);
2139
2140 return (arc_size > arc_c);
2141 }
2142
2143 /*
2144 * The buffer, supplied as the first argument, needs a data block.
2145 * So, if we are at cache max, determine which cache should be victimized.
2146 * We have the following cases:
2147 *
2148 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2149 * In this situation if we're out of space, but the resident size of the MFU is
2150 * under the limit, victimize the MFU cache to satisfy this insertion request.
2151 *
2152 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2153 * Here, we've used up all of the available space for the MRU, so we need to
2154 * evict from our own cache instead. Evict from the set of resident MRU
2155 * entries.
2156 *
2157 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2158 * c minus p represents the MFU space in the cache, since p is the size of the
2159 * cache that is dedicated to the MRU. In this situation there's still space on
2160 * the MFU side, so the MRU side needs to be victimized.
2161 *
2162 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2163 * MFU's resident set is consuming more space than it has been allotted. In
2164 * this situation, we must victimize our own cache, the MFU, for this insertion.
2165 */
2166 static void
2167 arc_get_data_buf(arc_buf_t *buf)
2168 {
2169 arc_state_t *state = buf->b_hdr->b_state;
2170 uint64_t size = buf->b_hdr->b_size;
2171 arc_buf_contents_t type = buf->b_hdr->b_type;
2172
2173 arc_adapt(size, state);
2174
2175 /*
2176 * We have not yet reached cache maximum size,
2177 * just allocate a new buffer.
2178 */
2179 if (!arc_evict_needed(type)) {
2180 if (type == ARC_BUFC_METADATA) {
2181 buf->b_data = zio_buf_alloc(size);
2182 arc_space_consume(size, ARC_SPACE_DATA);
2183 } else {
2184 ASSERT(type == ARC_BUFC_DATA);
2185 buf->b_data = zio_data_buf_alloc(size);
2186 ARCSTAT_INCR(arcstat_data_size, size);
2187 atomic_add_64(&arc_size, size);
2188 }
2189 goto out;
2190 }
2191
2192 /*
2193 * If we are prefetching from the mfu ghost list, this buffer
2194 * will end up on the mru list; so steal space from there.
2195 */
2196 if (state == arc_mfu_ghost)
2197 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2198 else if (state == arc_mru_ghost)
2199 state = arc_mru;
2200
2201 if (state == arc_mru || state == arc_anon) {
2202 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2203 state = (arc_mfu->arcs_lsize[type] >= size &&
2204 arc_p > mru_used) ? arc_mfu : arc_mru;
2205 } else {
2206 /* MFU cases */
2207 uint64_t mfu_space = arc_c - arc_p;
2208 state = (arc_mru->arcs_lsize[type] >= size &&
2209 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2210 }
2211 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2212 if (type == ARC_BUFC_METADATA) {
2213 buf->b_data = zio_buf_alloc(size);
2214 arc_space_consume(size, ARC_SPACE_DATA);
2215 } else {
2216 ASSERT(type == ARC_BUFC_DATA);
2217 buf->b_data = zio_data_buf_alloc(size);
2218 ARCSTAT_INCR(arcstat_data_size, size);
2219 atomic_add_64(&arc_size, size);
2220 }
2221 ARCSTAT_BUMP(arcstat_recycle_miss);
2222 }
2223 ASSERT(buf->b_data != NULL);
2224 out:
2225 /*
2226 * Update the state size. Note that ghost states have a
2227 * "ghost size" and so don't need to be updated.
2228 */
2229 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2230 arc_buf_hdr_t *hdr = buf->b_hdr;
2231
2232 atomic_add_64(&hdr->b_state->arcs_size, size);
2233 if (list_link_active(&hdr->b_arc_node)) {
2234 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2235 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2236 }
2237 /*
2238 * If we are growing the cache, and we are adding anonymous
2239 * data, and we have outgrown arc_p, update arc_p
2240 */
2241 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2242 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2243 arc_p = MIN(arc_c, arc_p + size);
2244 }
2245 }
2246
2247 /*
2248 * This routine is called whenever a buffer is accessed.
2249 * NOTE: the hash lock is dropped in this function.
2250 */
2251 static void
2252 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2253 {
2254 ASSERT(MUTEX_HELD(hash_lock));
2255
2256 if (buf->b_state == arc_anon) {
2257 /*
2258 * This buffer is not in the cache, and does not
2259 * appear in our "ghost" list. Add the new buffer
2260 * to the MRU state.
2261 */
2262
2263 ASSERT(buf->b_arc_access == 0);
2264 buf->b_arc_access = lbolt;
2265 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2266 arc_change_state(arc_mru, buf, hash_lock);
2267
2268 } else if (buf->b_state == arc_mru) {
2269 /*
2270 * If this buffer is here because of a prefetch, then either:
2271 * - clear the flag if this is a "referencing" read
2272 * (any subsequent access will bump this into the MFU state).
2273 * or
2274 * - move the buffer to the head of the list if this is
2275 * another prefetch (to make it less likely to be evicted).
2276 */
2277 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2278 if (refcount_count(&buf->b_refcnt) == 0) {
2279 ASSERT(list_link_active(&buf->b_arc_node));
2280 } else {
2281 buf->b_flags &= ~ARC_PREFETCH;
2282 ARCSTAT_BUMP(arcstat_mru_hits);
2283 }
2284 buf->b_arc_access = lbolt;
2285 return;
2286 }
2287
2288 /*
2289 * This buffer has been "accessed" only once so far,
2290 * but it is still in the cache. Move it to the MFU
2291 * state.
2292 */
2293 if (lbolt > buf->b_arc_access + ARC_MINTIME) {
2294 /*
2295 * More than 125ms have passed since we
2296 * instantiated this buffer. Move it to the
2297 * most frequently used state.
2298 */
2299 buf->b_arc_access = lbolt;
2300 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2301 arc_change_state(arc_mfu, buf, hash_lock);
2302 }
2303 ARCSTAT_BUMP(arcstat_mru_hits);
2304 } else if (buf->b_state == arc_mru_ghost) {
2305 arc_state_t *new_state;
2306 /*
2307 * This buffer has been "accessed" recently, but
2308 * was evicted from the cache. Move it to the
2309 * MFU state.
2310 */
2311
2312 if (buf->b_flags & ARC_PREFETCH) {
2313 new_state = arc_mru;
2314 if (refcount_count(&buf->b_refcnt) > 0)
2315 buf->b_flags &= ~ARC_PREFETCH;
2316 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2317 } else {
2318 new_state = arc_mfu;
2319 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2320 }
2321
2322 buf->b_arc_access = lbolt;
2323 arc_change_state(new_state, buf, hash_lock);
2324
2325 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2326 } else if (buf->b_state == arc_mfu) {
2327 /*
2328 * This buffer has been accessed more than once and is
2329 * still in the cache. Keep it in the MFU state.
2330 *
2331 * NOTE: an add_reference() that occurred when we did
2332 * the arc_read() will have kicked this off the list.
2333 * If it was a prefetch, we will explicitly move it to
2334 * the head of the list now.
2335 */
2336 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2337 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2338 ASSERT(list_link_active(&buf->b_arc_node));
2339 }
2340 ARCSTAT_BUMP(arcstat_mfu_hits);
2341 buf->b_arc_access = lbolt;
2342 } else if (buf->b_state == arc_mfu_ghost) {
2343 arc_state_t *new_state = arc_mfu;
2344 /*
2345 * This buffer has been accessed more than once but has
2346 * been evicted from the cache. Move it back to the
2347 * MFU state.
2348 */
2349
2350 if (buf->b_flags & ARC_PREFETCH) {
2351 /*
2352 * This is a prefetch access...
2353 * move this block back to the MRU state.
2354 */
2355 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2356 new_state = arc_mru;
2357 }
2358
2359 buf->b_arc_access = lbolt;
2360 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2361 arc_change_state(new_state, buf, hash_lock);
2362
2363 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2364 } else if (buf->b_state == arc_l2c_only) {
2365 /*
2366 * This buffer is on the 2nd Level ARC.
2367 */
2368
2369 buf->b_arc_access = lbolt;
2370 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2371 arc_change_state(arc_mfu, buf, hash_lock);
2372 } else {
2373 ASSERT(!"invalid arc state");
2374 }
2375 }
2376
2377 /* a generic arc_done_func_t which you can use */
2378 /* ARGSUSED */
2379 void
2380 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2381 {
2382 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2383 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2384 }
2385
2386 /* a generic arc_done_func_t */
2387 void
2388 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2389 {
2390 arc_buf_t **bufp = arg;
2391 if (zio && zio->io_error) {
2392 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2393 *bufp = NULL;
2394 } else {
2395 *bufp = buf;
2396 }
2397 }
2398
/*
 * Completion callback for the reads that arc_read_nolock() issues with
 * zio_read(); zio->io_private is the arc_buf_t that received the data.
 *
 * In order, this routine:
 *   - looks the header up in the hash table again (which also yields its
 *     hash lock when the header is found);
 *   - byteswaps the data if the block pointer calls for it and computes
 *     the buffer checksum;
 *   - gives every queued callback that has a done function a buffer: the
 *     first one gets the original buffer, later ones get clones;
 *   - clears ARC_IO_IN_PROGRESS and, on an I/O error, flags the header,
 *     moves it to arc_anon and removes it from the hash table;
 *   - wakes threads sleeping on the header's cv;
 *   - runs and frees the callbacks, then destroys the header if it was
 *     found to have no remaining references.
 */
2399 static void
2400 arc_read_done(zio_t *zio)
2401 {
2402 arc_buf_hdr_t *hdr, *found;
2403 arc_buf_t *buf;
2404 arc_buf_t *abuf; /* buffer we're assigning to callback */
2405 kmutex_t *hash_lock;
2406 arc_callback_t *callback_list, *acb;
2407 int freeable = FALSE;
2408
2409 buf = zio->io_private;
2410 hdr = buf->b_hdr;
2411
2412 /*
2413 * The hdr was inserted into hash-table and removed from lists
2414 * prior to starting I/O. We should find this header, since
2415 * it's in the hash table, and it should be legit since it's
2416 * not possible to evict it during the I/O. The only possible
2417 * reason for it not to be found is if we were freed during the
2418 * read.
2419 */
2420 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2421 &hash_lock);
2422
2423 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2424 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2425 (found == hdr && HDR_L2_READING(hdr)));
2426
2427 hdr->b_flags &= ~ARC_L2_EVICTED;
/* prefetched blocks are not marked for the L2ARC when l2arc_noprefetch is set */
2428 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2429 hdr->b_flags &= ~ARC_L2CACHE;
2430
2431 /* byteswap if necessary */
2432 callback_list = hdr->b_acb;
2433 ASSERT(callback_list != NULL);
2434 if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
/* indirect blocks (level > 0) are swapped as arrays of 64-bit words */
2435 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2436 byteswap_uint64_array :
2437 dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2438 func(buf->b_data, hdr->b_size);
2439 }
2440
2441 arc_cksum_compute(buf, B_FALSE);
2442
2443 /* create copies of the data buffer for the callers */
2444 abuf = buf;
2445 for (acb = callback_list; acb; acb = acb->acb_next) {
2446 if (acb->acb_done) {
2447 if (abuf == NULL)
2448 abuf = arc_buf_clone(buf);
2449 acb->acb_buf = abuf;
2450 abuf = NULL;
2451 }
2452 }
2453 hdr->b_acb = NULL;
2454 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2455 ASSERT(!HDR_BUF_AVAILABLE(hdr));
/*
 * abuf still equals buf only if no callback took the original buffer;
 * in that case flag it as available.
 */
2456 if (abuf == buf)
2457 hdr->b_flags |= ARC_BUF_AVAILABLE;
2458
2459 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2460
2461 if (zio->io_error != 0) {
/* a failed read must not stay in the cache: make the header anonymous */
2462 hdr->b_flags |= ARC_IO_ERROR;
2463 if (hdr->b_state != arc_anon)
2464 arc_change_state(arc_anon, hdr, hash_lock);
2465 if (HDR_IN_HASH_TABLE(hdr))
2466 buf_hash_remove(hdr);
2467 freeable = refcount_is_zero(&hdr->b_refcnt);
2468 }
2469
2470 /*
2471 * Broadcast before we drop the hash_lock to avoid the possibility
2472 * that the hdr (and hence the cv) might be freed before we get to
2473 * the cv_broadcast().
2474 */
2475 cv_broadcast(&hdr->b_cv);
2476
2477 if (hash_lock) {
2478 /*
2479 * Only call arc_access on anonymous buffers. This is because
2480 * if we've issued an I/O for an evicted buffer, we've already
2481 * called arc_access (to prevent any simultaneous readers from
2482 * getting confused).
2483 */
2484 if (zio->io_error == 0 && hdr->b_state == arc_anon)
2485 arc_access(hdr, hash_lock);
2486 mutex_exit(hash_lock);
2487 } else {
2488 /*
2489 * This block was freed while we waited for the read to
2490 * complete. It has been removed from the hash table and
2491 * moved to the anonymous state (so that it won't show up
2492 * in the cache).
2493 */
2494 ASSERT3P(hdr->b_state, ==, arc_anon);
2495 freeable = refcount_is_zero(&hdr->b_refcnt);
2496 }
2497
2498 /* execute each callback and free its structure */
2499 while ((acb = callback_list) != NULL) {
2500 if (acb->acb_done)
2501 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2502
/*
 * A callback may carry a placeholder zio (created with zio_null() in
 * arc_read_nolock()); pass the read's error on to it and dispatch it.
 */
2503 if (acb->acb_zio_dummy != NULL) {
2504 acb->acb_zio_dummy->io_error = zio->io_error;
2505 zio_nowait(acb->acb_zio_dummy);
2506 }
2507
2508 callback_list = acb->acb_next;
2509 kmem_free(acb, sizeof (arc_callback_t));
2510 }
2511
/* nobody holds the failed or freed header any more: get rid of it */
2512 if (freeable)
2513 arc_hdr_destroy(hdr);
2514 }
2515
2516 /*
2517 * "Read" the block block at the specified DVA (in bp) via the
2518 * cache. If the block is found in the cache, invoke the provided
2519 * callback immediately and return. Note that the `zio' parameter
2520 * in the callback will be NULL in this case, since no IO was
2521 * required. If the block is not in the cache pass the read request
2522 * on to the spa with a substitute callback function, so that the
2523 * requested block will be added to the cache.
2524 *
2525 * If a read request arrives for a block that has a read in-progress,
2526 * either wait for the in-progress read to complete (and return the
2527 * results); or, if this is a read with a "done" func, add a record
2528 * to the read to invoke the "done" func when the read completes,
2529 * and return; or just return.
2530 *
2531 * arc_read_done() will invoke all the requested "done" functions
2532 * for readers of this block.
2533 *
2534 * Normal callers should use arc_read and pass the arc buffer and offset
2535 * for the bp. But if you know you don't need locking, you can use
2536 * arc_read_bp.
2537 */
2538 int
2539 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
2540 arc_done_func_t *done, void *private, int priority, int zio_flags,
2541 uint32_t *arc_flags, const zbookmark_t *zb)
2542 {
2543 int err;
2544
2545 ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2546 ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
2547 rw_enter(&pbuf->b_lock, RW_READER);
2548
2549 err = arc_read_nolock(pio, spa, bp, done, private, priority,
2550 zio_flags, arc_flags, zb);
2551 rw_exit(&pbuf->b_lock);
2552
2553 return (err);
2554 }
2555
/*
 * The worker behind arc_read(); arc_read() calls it with the parent
 * buffer's b_lock held, other callers invoke it directly.
 *
 * Cache hit (a header is found and it has data, b_datacnt > 0):
 *   ARC_CACHED is set in *arc_flags.
 *   - If a read of the block is still in flight: an ARC_WAIT caller
 *     sleeps on the header's cv and then retries from `top'; an
 *     ARC_NOWAIT caller with a done function queues an arc_callback_t on
 *     the header and returns 0; an ARC_NOWAIT caller without one just
 *     returns 0.
 *   - Otherwise arc_access() is called and done(), if supplied, is
 *     invoked right away with a NULL zio.
 *
 * Cache miss (no header, or a header in a ghost state):
 *   a header/buffer pair is set up, the header is marked
 *   ARC_IO_IN_PROGRESS and the read is issued, to the L2ARC device when
 *   the conditions below allow it, else (or after an L2ARC read error for
 *   an ARC_WAIT caller) through zio_read() with arc_read_done() as the
 *   completion callback.
 *
 * Returns 0, except for an ARC_WAIT miss served by zio_read(), where the
 * zio_wait() result is returned.
 */
2556 int
2557 arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
2558 arc_done_func_t *done, void *private, int priority, int zio_flags,
2559 uint32_t *arc_flags, const zbookmark_t *zb)
2560 {
2561 arc_buf_hdr_t *hdr;
2562 arc_buf_t *buf;
2563 kmutex_t *hash_lock;
2564 zio_t *rzio;
2565 uint64_t guid = spa_guid(spa);
2566
/*
 * Restart point: reached again after waiting for an in-flight read and
 * after losing the race to insert a new header into the hash table.
 */
2567 top:
2568 hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2569 if (hdr && hdr->b_datacnt > 0) {
2570
2571 *arc_flags |= ARC_CACHED;
2572
2573 if (HDR_IO_IN_PROGRESS(hdr)) {
2574
2575 if (*arc_flags & ARC_WAIT) {
/* arc_read_done() broadcasts on b_cv when the read finishes */
2576 cv_wait(&hdr->b_cv, hash_lock);
2577 mutex_exit(hash_lock);
2578 goto top;
2579 }
2580 ASSERT(*arc_flags & ARC_NOWAIT);
2581
2582 if (done) {
/* piggy-back on the read in flight: queue a callback record for it */
2583 arc_callback_t *acb = NULL;
2584
2585 acb = kmem_zalloc(sizeof (arc_callback_t),
2586 KM_SLEEP);
2587 acb->acb_done = done;
2588 acb->acb_private = private;
2589 if (pio != NULL)
2590 acb->acb_zio_dummy = zio_null(pio,
2591 spa, NULL, NULL, NULL, zio_flags);
2592
2593 ASSERT(acb->acb_done != NULL);
2594 acb->acb_next = hdr->b_acb;
2595 hdr->b_acb = acb;
2596 add_reference(hdr, hash_lock, private);
2597 mutex_exit(hash_lock);
2598 return (0);
2599 }
2600 mutex_exit(hash_lock);
2601 return (0);
2602 }
2603
2604 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2605
2606 if (done) {
2607 add_reference(hdr, hash_lock, private);
2608 /*
2609 * If this block is already in use, create a new
2610 * copy of the data so that we will be guaranteed
2611 * that arc_release() will always succeed.
2612 */
2613 buf = hdr->b_buf;
2614 ASSERT(buf);
2615 ASSERT(buf->b_data);
2616 if (HDR_BUF_AVAILABLE(hdr)) {
2617 ASSERT(buf->b_efunc == NULL);
2618 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2619 } else {
2620 buf = arc_buf_clone(buf);
2621 }
2622 } else if (*arc_flags & ARC_PREFETCH &&
2623 refcount_count(&hdr->b_refcnt) == 0) {
2624 hdr->b_flags |= ARC_PREFETCH;
2625 }
2626 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2627 arc_access(hdr, hash_lock);
2628 if (*arc_flags & ARC_L2CACHE)
2629 hdr->b_flags |= ARC_L2CACHE;
2630 mutex_exit(hash_lock);
2631 ARCSTAT_BUMP(arcstat_hits);
2632 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2633 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2634 data, metadata, hits);
2635
/* cached: the callback runs synchronously, with no zio */
2636 if (done)
2637 done(NULL, buf, private);
2638 } else {
2639 uint64_t size = BP_GET_LSIZE(bp);
2640 arc_callback_t *acb;
2641 vdev_t *vd = NULL;
2642 uint64_t addr;
2643 boolean_t devw = B_FALSE;
2644
2645 if (hdr == NULL) {
2646 /* this block is not in the cache */
2647 arc_buf_hdr_t *exists;
2648 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2649 buf = arc_buf_alloc(spa, size, private, type);
2650 hdr = buf->b_hdr;
2651 hdr->b_dva = *BP_IDENTITY(bp);
2652 hdr->b_birth = bp->blk_birth;
2653 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2654 exists = buf_hash_insert(hdr, &hash_lock);
2655 if (exists) {
2656 /* somebody beat us to the hash insert */
/* clear the identity we just set, drop our buffer and start over */
2657 mutex_exit(hash_lock);
2658 bzero(&hdr->b_dva, sizeof (dva_t));
2659 hdr->b_birth = 0;
2660 hdr->b_cksum0 = 0;
2661 (void) arc_buf_remove_ref(buf, private);
2662 goto top; /* restart the IO request */
2663 }
2664 /* if this is a prefetch, we don't have a reference */
/*
 * NOTE(review): the remove_reference() here implies that
 * arc_buf_alloc() returned with a hold tagged `private' - confirm.
 */
2665 if (*arc_flags & ARC_PREFETCH) {
2666 (void) remove_reference(hdr, hash_lock,
2667 private);
2668 hdr->b_flags |= ARC_PREFETCH;
2669 }
2670 if (*arc_flags & ARC_L2CACHE)
2671 hdr->b_flags |= ARC_L2CACHE;
2672 if (BP_GET_LEVEL(bp) > 0)
2673 hdr->b_flags |= ARC_INDIRECT;
2674 } else {
2675 /* this block is in the ghost cache */
2676 ASSERT(GHOST_STATE(hdr->b_state));
2677 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2678 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
2679 ASSERT(hdr->b_buf == NULL);
2680
2681 /* if this is a prefetch, we don't have a reference */
2682 if (*arc_flags & ARC_PREFETCH)
2683 hdr->b_flags |= ARC_PREFETCH;
2684 else
2685 add_reference(hdr, hash_lock, private);
2686 if (*arc_flags & ARC_L2CACHE)
2687 hdr->b_flags |= ARC_L2CACHE;
/* the ghost header has no buffer: build one and give it a data block */
2688 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2689 buf->b_hdr = hdr;
2690 buf->b_data = NULL;
2691 buf->b_efunc = NULL;
2692 buf->b_private = NULL;
2693 buf->b_next = NULL;
2694 hdr->b_buf = buf;
2695 arc_get_data_buf(buf);
2696 ASSERT(hdr->b_datacnt == 0);
2697 hdr->b_datacnt = 1;
2698
2699 }
2700
/* the callback record that arc_read_done() will run for this caller */
2701 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2702 acb->acb_done = done;
2703 acb->acb_private = private;
2704
2705 ASSERT(hdr->b_acb == NULL);
2706 hdr->b_acb = acb;
2707 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2708
2709 /*
2710 * If the buffer has been evicted, migrate it to a present state
2711 * before issuing the I/O. Once we drop the hash-table lock,
2712 * the header will be marked as I/O in progress and have an
2713 * attached buffer. At this point, anybody who finds this
2714 * buffer ought to notice that it's legit but has a pending I/O.
2715 */
2716
2717 if (GHOST_STATE(hdr->b_state))
2718 arc_access(hdr, hash_lock);
2719
/*
 * Capture the L2ARC device, its writing flag and the block's device
 * address while the hash lock is still held.
 */
2720 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2721 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2722 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2723 addr = hdr->b_l2hdr->b_daddr;
2724 /*
2725 * Lock out device removal.
2726 */
2727 if (vdev_is_dead(vd) ||
2728 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2729 vd = NULL;
2730 }
2731
2732 mutex_exit(hash_lock);
2733
2734 ASSERT3U(hdr->b_size, ==, size);
2735 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
2736 zbookmark_t *, zb);
2737 ARCSTAT_BUMP(arcstat_misses);
2738 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2739 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2740 data, metadata, misses);
2741
/* vd != NULL from here on means SCL_L2ARC is held and must be released */
2742 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
2743 /*
2744 * Read from the L2ARC if the following are true:
2745 * 1. The L2ARC vdev was previously cached.
2746 * 2. This buffer still has L2ARC metadata.
2747 * 3. This buffer isn't currently writing to the L2ARC.
2748 * 4. The L2ARC entry wasn't evicted, which may
2749 * also have invalidated the vdev.
2750 * 5. Not both: this is a prefetch and l2arc_noprefetch is set.
2751 */
2752 if (hdr->b_l2hdr != NULL &&
2753 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
2754 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
2755 l2arc_read_callback_t *cb;
2756
2757 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2758 ARCSTAT_BUMP(arcstat_l2_hits);
2759
2760 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2761 KM_SLEEP);
2762 cb->l2rcb_buf = buf;
2763 cb->l2rcb_spa = spa;
2764 cb->l2rcb_bp = *bp;
/*
 * NOTE(review): zb is dereferenced unconditionally, so callers that can
 * reach this path must pass a non-NULL bookmark - confirm.
 */
2765 cb->l2rcb_zb = *zb;
2766 cb->l2rcb_flags = zio_flags;
2767
2768 /*
2769 * l2arc read. The SCL_L2ARC lock will be
2770 * released by l2arc_read_done().
2771 */
2772 rzio = zio_read_phys(pio, vd, addr, size,
2773 buf->b_data, ZIO_CHECKSUM_OFF,
2774 l2arc_read_done, cb, priority, zio_flags |
2775 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
2776 ZIO_FLAG_DONT_PROPAGATE |
2777 ZIO_FLAG_DONT_RETRY, B_FALSE);
2778 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
2779 zio_t *, rzio);
2780 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
2781
2782 if (*arc_flags & ARC_NOWAIT) {
2783 zio_nowait(rzio);
2784 return (0);
2785 }
2786
2787 ASSERT(*arc_flags & ARC_WAIT);
2788 if (zio_wait(rzio) == 0)
2789 return (0);
2790
2791 /* l2arc read error; goto zio_read() */
2792 } else {
2793 DTRACE_PROBE1(l2arc__miss,
2794 arc_buf_hdr_t *, hdr);
2795 ARCSTAT_BUMP(arcstat_l2_misses);
2796 if (HDR_L2_WRITING(hdr))
2797 ARCSTAT_BUMP(arcstat_l2_rw_clash);
2798 spa_config_exit(spa, SCL_L2ARC, vd);
2799 }
2800 } else {
2801 if (vd != NULL)
2802 spa_config_exit(spa, SCL_L2ARC, vd);
2803 if (l2arc_ndev != 0) {
2804 DTRACE_PROBE1(l2arc__miss,
2805 arc_buf_hdr_t *, hdr);
2806 ARCSTAT_BUMP(arcstat_l2_misses);
2807 }
2808 }
2809
/* read from the pool itself; arc_read_done() completes the request */
2810 rzio = zio_read(pio, spa, bp, buf->b_data, size,
2811 arc_read_done, buf, priority, zio_flags, zb);
2812
2813 if (*arc_flags & ARC_WAIT)
2814 return (zio_wait(rzio));
2815
2816 ASSERT(*arc_flags & ARC_NOWAIT);
2817 zio_nowait(rzio);
2818 }
2819 return (0);
2820 }
2821
2822 /*
2823 * arc_read() variant to support pool traversal. If the block is already
2824 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2825 * The idea is that we don't want pool traversal filling up memory, but
2826 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
2827 */
2828 int
2829 arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
2830 {
2831 arc_buf_hdr_t *hdr;
2832 kmutex_t *hash_mtx;
2833 uint64_t guid = spa_guid(spa);
2834 int rc = 0;
2835
2836 hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
2837
2838 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
2839 arc_buf_t *buf = hdr->b_buf;
2840
2841 ASSERT(buf);
2842 while (buf->b_data == NULL) {
2843 buf = buf->b_next;
2844 ASSERT(buf);
2845 }
2846 bcopy(buf->b_data, data, hdr->b_size);
2847 } else {
2848 rc = ENOENT;
2849 }
2850
2851 if (hash_mtx)
2852 mutex_exit(hash_mtx);
2853
2854 return (rc);
2855 }
2856
2857 void
2858 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
2859 {
2860 ASSERT(buf->b_hdr != NULL);
2861 ASSERT(buf->b_hdr->b_state != arc_anon);
2862 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
2863 buf->b_efunc = func;
2864 buf->b_private = private;
2865 }
2866
2867 /*
2868 * This is used by the DMU to let the ARC know that a buffer is
2869 * being evicted, so the ARC should clean up. If this arc buf
2870 * is not yet in the evicted state, it will be put there.
 *
 * Returns 0 when the buf no longer has a header (it is being handled by
 * arc_do_user_evicts()); returns 1 once the buf's eviction callback has
 * been invoked from here.
 *
 * Locks are taken in this order: buf->b_lock (as writer), the header's
 * hash lock, then the mutexes of the old state and of the ghost state.
2871 */
2872 int
2873 arc_buf_evict(arc_buf_t *buf)
2874 {
2875 arc_buf_hdr_t *hdr;
2876 kmutex_t *hash_lock;
2877 arc_buf_t **bufp;
2878
2879 rw_enter(&buf->b_lock, RW_WRITER);
2880 hdr = buf->b_hdr;
2881 if (hdr == NULL) {
2882 /*
2883 * We are in arc_do_user_evicts().
2884 */
2885 ASSERT(buf->b_data == NULL);
2886 rw_exit(&buf->b_lock);
2887 return (0);
2888 } else if (buf->b_data == NULL) {
/*
 * Work on a snapshot: the callback is cleared in the real buf and then
 * invoked on the copy after b_lock has been dropped.
 */
2889 arc_buf_t copy = *buf; /* structure assignment */
2890 /*
2891 * We are on the eviction list; process this buffer now
2892 * but let arc_do_user_evicts() do the reaping.
2893 */
2894 buf->b_efunc = NULL;
2895 rw_exit(&buf->b_lock);
2896 VERIFY(copy.b_efunc(&copy) == 0);
2897 return (1);
2898 }
2899 hash_lock = HDR_LOCK(hdr);
2900 mutex_enter(hash_lock);
2901
2902 ASSERT(buf->b_hdr == hdr);
2903 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2904 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2905
2906 /*
2907 * Pull this buffer off of the hdr
2908 */
2909 bufp = &hdr->b_buf;
2910 while (*bufp != buf)
2911 bufp = &(*bufp)->b_next;
2912 *bufp = buf->b_next;
2913
2914 ASSERT(buf->b_data != NULL);
2915 arc_buf_destroy(buf, FALSE, FALSE);
2916
/* that was the header's last buffer: move the header to its ghost state */
2917 if (hdr->b_datacnt == 0) {
2918 arc_state_t *old_state = hdr->b_state;
2919 arc_state_t *evicted_state;
2920
2921 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2922
2923 evicted_state =
2924 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2925
2926 mutex_enter(&old_state->arcs_mtx);
2927 mutex_enter(&evicted_state->arcs_mtx);
2928
2929 arc_change_state(evicted_state, hdr, hash_lock);
/*
 * NOTE(review): HDR_IN_HASH_TABLE() presumably tests ARC_IN_HASH_TABLE,
 * which would make the |= below a no-op - confirm.
 */
2930 ASSERT(HDR_IN_HASH_TABLE(hdr));
2931 hdr->b_flags |= ARC_IN_HASH_TABLE;
2932 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2933
2934 mutex_exit(&evicted_state->arcs_mtx);
2935 mutex_exit(&old_state->arcs_mtx);
2936 }
2937 mutex_exit(hash_lock);
2938 rw_exit(&buf->b_lock);
2939
/*
 * The buf has been unlinked from its header above; run its eviction
 * callback with all locks dropped, then give it back to buf_cache.
 */
2940 VERIFY(buf->b_efunc(buf) == 0);
2941 buf->b_efunc = NULL;
2942 buf->b_private = NULL;
2943 buf->b_hdr = NULL;
2944 kmem_cache_free(buf_cache, buf);
2945 return (1);
2946 }
2947
2948 /*
2949 * Release this buffer from the cache. This must be done
2950 * after a read and prior to modifying the buffer contents.
2951 * If the buffer has more than one reference, we must make
2952 * a new hdr for the buffer.
 *
 * buf: the buffer to release; its header must be referenced.
 * tag: the reference tag; when a new header is made, the hold under this
 *      tag is moved from the old header to the new one.
 *
 * Three cases are handled:
 *   - the header is already anonymous: only the buffer is thawed;
 *   - the header has several buffers: `buf' is unlinked and attached to a
 *     freshly allocated anonymous header;
 *   - the header has a single buffer: the header itself is made
 *     anonymous and its identity (dva, birth, cksum0) is cleared.
 * In every case an attached L2ARC header is detached and freed.
2953 */
2954 void
2955 arc_release(arc_buf_t *buf, void *tag)
2956 {
2957 arc_buf_hdr_t *hdr;
/* hash_lock is set only when the header is not already anonymous */
2958 kmutex_t *hash_lock;
2959 l2arc_buf_hdr_t *l2hdr;
/* buf_size is set, and later read, only when an L2ARC header is attached */
2960 uint64_t buf_size;
2961 boolean_t released = B_FALSE;
2962
2963 rw_enter(&buf->b_lock, RW_WRITER);
2964 hdr = buf->b_hdr;
2965
2966 /* this buffer is not on any list */
2967 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2968 ASSERT(!(hdr->b_flags & ARC_STORED));
2969
2970 if (hdr->b_state == arc_anon) {
2971 /* this buffer is already released */
2972 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2973 ASSERT(BUF_EMPTY(hdr));
2974 ASSERT(buf->b_efunc == NULL);
2975 arc_buf_thaw(buf);
2976 rw_exit(&buf->b_lock);
2977 released = B_TRUE;
2978 } else {
2979 hash_lock = HDR_LOCK(hdr);
2980 mutex_enter(hash_lock);
2981 }
2982
/*
 * Detach the L2ARC header now; it is unlinked from the device's buffer
 * list and freed at `out', where l2arc_buflist_mtx is dropped again.
 */
2983 l2hdr = hdr->b_l2hdr;
2984 if (l2hdr) {
2985 mutex_enter(&l2arc_buflist_mtx);
2986 hdr->b_l2hdr = NULL;
2987 buf_size = hdr->b_size;
2988 }
2989
2990 if (released)
2991 goto out;
2992
2993 /*
2994 * Do we have more than one buf?
2995 */
2996 if (hdr->b_datacnt > 1) {
2997 arc_buf_hdr_t *nhdr;
2998 arc_buf_t **bufp;
/* copy what the new header needs while the hash lock is still held */
2999 uint64_t blksz = hdr->b_size;
3000 uint64_t spa = hdr->b_spa;
3001 arc_buf_contents_t type = hdr->b_type;
3002 uint32_t flags = hdr->b_flags;
3003
3004 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3005 /*
3006 * Pull the data off of this buf and attach it to
3007 * a new anonymous buf.
3008 */
3009 (void) remove_reference(hdr, hash_lock, tag);
3010 bufp = &hdr->b_buf;
3011 while (*bufp != buf)
3012 bufp = &(*bufp)->b_next;
3013 *bufp = (*bufp)->b_next;
3014 buf->b_next = NULL;
3015
/* the old state no longer accounts for this buffer's bytes */
3016 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3017 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3018 if (refcount_is_zero(&hdr->b_refcnt)) {
3019 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3020 ASSERT3U(*size, >=, hdr->b_size);
3021 atomic_add_64(size, -hdr->b_size);
3022 }
3023 hdr->b_datacnt -= 1;
3024 arc_cksum_verify(buf);
3025
3026 mutex_exit(hash_lock);
3027
/* build the anonymous header that takes over `buf' */
3028 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3029 nhdr->b_size = blksz;
3030 nhdr->b_spa = spa;
3031 nhdr->b_type = type;
3032 nhdr->b_buf = buf;
3033 nhdr->b_state = arc_anon;
3034 nhdr->b_arc_access = 0;
/* of the old header's flags only ARC_L2_WRITING is carried over */
3035 nhdr->b_flags = flags & ARC_L2_WRITING;
3036 nhdr->b_l2hdr = NULL;
3037 nhdr->b_datacnt = 1;
3038 nhdr->b_freeze_cksum = NULL;
3039 (void) refcount_add(&nhdr->b_refcnt, tag);
3040 buf->b_hdr = nhdr;
3041 rw_exit(&buf->b_lock);
3042 atomic_add_64(&arc_anon->arcs_size, blksz);
3043 } else {
/* single buffer: the existing header itself becomes anonymous */
3044 rw_exit(&buf->b_lock);
3045 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3046 ASSERT(!list_link_active(&hdr->b_arc_node));
3047 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3048 arc_change_state(arc_anon, hdr, hash_lock);
3049 hdr->b_arc_access = 0;
3050 mutex_exit(hash_lock);
3051
/* wipe the block identity of the now anonymous header */
3052 bzero(&hdr->b_dva, sizeof (dva_t));
3053 hdr->b_birth = 0;
3054 hdr->b_cksum0 = 0;
3055 arc_buf_thaw(buf);
3056 }
3057 buf->b_efunc = NULL;
3058 buf->b_private = NULL;
3059
3060 out:
3061 if (l2hdr) {
3062 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3063 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3064 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3065 mutex_exit(&l2arc_buflist_mtx);
3066 }
3067 }
3068
3069 int
3070 arc_released(arc_buf_t *buf)
3071 {
3072 int released;
3073
3074 rw_enter(&buf->b_lock, RW_READER);
3075 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3076 rw_exit(&buf->b_lock);
3077 return (released);
3078 }
3079
3080 int
3081 arc_has_callback(arc_buf_t *buf)
3082 {
3083 int callback;
3084
3085 rw_enter(&buf->b_lock, RW_READER);
3086 callback = (buf->b_efunc != NULL);
3087 rw_exit(&buf->b_lock);
3088 return (callback);
3089 }
3090
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the current reference count of the header
 * backing this buffer, sampled with the buffer lock held as reader.
 */
int
arc_referenced(arc_buf_t *buf)
{
	int holds;

	rw_enter(&buf->b_lock, RW_READER);
	holds = refcount_count(&buf->b_hdr->b_refcnt);
	rw_exit(&buf->b_lock);

	return (holds);
}
#endif
3103
3104 static void
3105 arc_write_ready(zio_t *zio)
3106 {
3107 arc_write_callback_t *callback = zio->io_private;
3108 arc_buf_t *buf = callback->awcb_buf;
3109 arc_buf_hdr_t *hdr = buf->b_hdr;
3110
3111 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3112 callback->awcb_ready(zio, buf, callback->awcb_private);
3113
3114 /*
3115 * If the IO is already in progress, then this is a re-write
3116 * attempt, so we need to thaw and re-compute the cksum.
3117 * It is the responsibility of the callback to handle the
3118 * accounting for any re-write attempt.
3119 */
3120 if (HDR_IO_IN_PROGRESS(hdr)) {
3121 mutex_enter(&hdr->b_freeze_lock);
3122 if (hdr->b_freeze_cksum != NULL) {
3123 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3124 hdr->b_freeze_cksum = NULL;
3125 }
3126 mutex_exit(&hdr->b_freeze_lock);
3127 }
3128 arc_cksum_compute(buf, B_FALSE);
3129 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3130 }
3131
3132 static void
3133 arc_write_done(zio_t *zio)
3134 {
3135 arc_write_callback_t *callback = zio->io_private;
3136 arc_buf_t *buf = callback->awcb_buf;
3137 arc_buf_hdr_t *hdr = buf->b_hdr;
3138
3139 hdr->b_acb = NULL;
3140
3141 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3142 hdr->b_birth = zio->io_bp->blk_birth;
3143 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3144 /*
3145 * If the block to be written was all-zero, we may have
3146 * compressed it away. In this case no write was performed
3147 * so there will be no dva/birth-date/checksum. The buffer
3148 * must therefor remain anonymous (and uncached).
3149 */
3150 if (!BUF_EMPTY(hdr)) {
3151 arc_buf_hdr_t *exists;
3152 kmutex_t *hash_lock;
3153
3154 arc_cksum_verify(buf);
3155
3156 exists = buf_hash_insert(hdr, &hash_lock);
3157 if (exists) {
3158 /*
3159 * This can only happen if we overwrite for
3160 * sync-to-convergence, because we remove
3161 * buffers from the hash table when we arc_free().
3162 */
3163 ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
3164 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
3165 BP_IDENTITY(zio->io_bp)));
3166 ASSERT3U(zio->io_bp_orig.blk_birth, ==,
3167 zio->io_bp->blk_birth);
3168
3169 ASSERT(refcount_is_zero(&exists->b_refcnt));
3170 arc_change_state(arc_anon, exists, hash_lock);
3171 mutex_exit(hash_lock);
3172 arc_hdr_destroy(exists);
3173 exists = buf_hash_insert(hdr, &hash_lock);
3174 ASSERT3P(exists, ==, NULL);
3175 }
3176 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3177 /* if it's not anon, we are doing a scrub */
3178 if (hdr->b_state == arc_anon)
3179 arc_access(hdr, hash_lock);
3180 mutex_exit(hash_lock);
3181 } else if (callback->awcb_done == NULL) {
3182 int destroy_hdr;
3183 /*
3184 * This is an anonymous buffer with no user callback,
3185 * destroy it if there are no active references.
3186 */
3187 mutex_enter(&arc_eviction_mtx);
3188 destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
3189 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3190 mutex_exit(&arc_eviction_mtx);
3191 if (destroy_hdr)
3192 arc_hdr_destroy(hdr);
3193 } else {
3194 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3195 }
3196 hdr->b_flags &= ~ARC_STORED;
3197
3198 if (callback->awcb_done) {
3199 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3200 callback->awcb_done(zio, buf, callback->awcb_private);
3201 }
3202
3203 kmem_free(callback, sizeof (arc_write_callback_t));
3204 }
3205
3206 void
3207 write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
3208 {
3209 boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
3210
3211 /* Determine checksum setting */
3212 if (ismd) {
3213 /*
3214 * Metadata always gets checksummed. If the data
3215 * checksum is multi-bit correctable, and it's not a
3216 * ZBT-style checksum, then it's suitable for metadata
3217 * as well. Otherwise, the metadata checksum defaults
3218 * to fletcher4.
3219 */
3220 if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
3221 !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
3222 zp->zp_checksum = wp->wp_oschecksum;
3223 else
3224 zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
3225 } else {
3226 zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
3227 wp->wp_oschecksum);
3228 }
3229
3230 /* Determine compression setting */
3231 if (ismd) {
3232 /*
3233 * XXX -- we should design a compression algorithm
3234 * that specializes in arrays of bps.
3235 */
3236 zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
3237 ZIO_COMPRESS_LZJB;
3238 } else {
3239 zp->zp_compress = zio_compress_select(wp->wp_dncompress,
3240 wp->wp_oscompress);
3241 }
3242
3243 zp->zp_type = wp->wp_type;
3244 zp->zp_level = wp->wp_level;
3245 zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
3246 }
3247
3248 zio_t *
3249 arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
3250 boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
3251 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
3252 int zio_flags, const zbookmark_t *zb)
3253 {
3254 arc_buf_hdr_t *hdr = buf->b_hdr;
3255 arc_write_callback_t *callback;
3256 zio_t *zio;
3257 zio_prop_t zp;
3258
3259 ASSERT(ready != NULL);
3260 ASSERT(!HDR_IO_ERROR(hdr));
3261 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3262 ASSERT(hdr->b_acb == 0);
3263 if (l2arc)
3264 hdr->b_flags |= ARC_L2CACHE;
3265 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3266 callback->awcb_ready = ready;
3267 callback->awcb_done = done;
3268 callback->awcb_private = private;
3269 callback->awcb_buf = buf;
3270
3271 write_policy(spa, wp, &zp);
3272 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
3273 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3274
3275 return (zio);
3276 }
3277
/*
 * Free the block described by 'bp'.
 *
 * If a header for the block is found in the hash table it is first moved
 * to the anonymous state (unless already there) and then handled in one of
 * three ways: flagged ARC_FREED_IN_READ and stripped of its identity when
 * I/O is still in progress, destroyed when it has no references, or just
 * stripped of its identity when references remain.
 *
 * A zio_free() is then created.  With ARC_WAIT in 'arc_flags' the result
 * of zio_wait() is returned; otherwise ARC_NOWAIT is expected, the zio is
 * issued asynchronously and 0 is returned.
 */
3278 int
3279 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
3280 zio_done_func_t *done, void *private, uint32_t arc_flags)
3281 {
3282 arc_buf_hdr_t *ab;
3283 kmutex_t *hash_lock;
3284 zio_t *zio;
3285 uint64_t guid = spa_guid(spa);
3286
3287 /*
3288 * If this buffer is in the cache, release it, so it
3289 * can be re-used.
3290 */
3291 ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
3292 if (ab != NULL) {
3293 /*
3294 * The checksum of blocks to free is not always
3295 * preserved (eg. on the deadlist). However, if it is
3296 * nonzero, it should match what we have in the cache.
3297 */
3298 ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
3299 bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
3300 bp->blk_fill == BLK_FILL_ALREADY_FREED);
3301
3302 if (ab->b_state != arc_anon)
3303 arc_change_state(arc_anon, ab, hash_lock);
3304 if (HDR_IO_IN_PROGRESS(ab)) {
3305 /*
3306 * This should only happen when we prefetch.
3307 */
3308 ASSERT(ab->b_flags & ARC_PREFETCH);
3309 ASSERT3U(ab->b_datacnt, ==, 1);
3310 ab->b_flags |= ARC_FREED_IN_READ;
3311 if (HDR_IN_HASH_TABLE(ab))
3312 buf_hash_remove(ab);
/* Wipe the block identity and detach any eviction callback. */
3313 ab->b_arc_access = 0;
3314 bzero(&ab->b_dva, sizeof (dva_t));
3315 ab->b_birth = 0;
3316 ab->b_cksum0 = 0;
3317 ab->b_buf->b_efunc = NULL;
3318 ab->b_buf->b_private = NULL;
3319 mutex_exit(hash_lock);
3320 } else if (refcount_is_zero(&ab->b_refcnt)) {
/* Unreferenced: drop the hash lock first, then destroy the header. */
3321 ab->b_flags |= ARC_FREE_IN_PROGRESS;
3322 mutex_exit(hash_lock);
3323 arc_hdr_destroy(ab);
3324 ARCSTAT_BUMP(arcstat_deleted);
3325 } else {
3326 /*
3327 * We still have an active reference on this
3328 * buffer. This can happen, e.g., from
3329 * dbuf_unoverride().
3330 */
3331 ASSERT(!HDR_IN_HASH_TABLE(ab));
/* Same identity wipe as the in-progress case above. */
3332 ab->b_arc_access = 0;
3333 bzero(&ab->b_dva, sizeof (dva_t));
3334 ab->b_birth = 0;
3335 ab->b_cksum0 = 0;
3336 ab->b_buf->b_efunc = NULL;
3337 ab->b_buf->b_private = NULL;
3338 mutex_exit(hash_lock);
3339 }
3340 }
3341
3342 zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
3343
3344 if (arc_flags & ARC_WAIT)
3345 return (zio_wait(zio));
3346
3347 ASSERT(arc_flags & ARC_NOWAIT);
3348 zio_nowait(zio);
3349
3350 return (0);
3351 }
3352
/*
 * Decide whether a write reservation has to be throttled because memory
 * is tight.
 *
 * Returns 0 when the write may proceed, ERESTART or EAGAIN when the
 * caller should back off.  When _KERNEL is not defined the whole body is
 * compiled out and the function always returns 0.
 *
 * page_load and last_txg are function-static and therefore persist across
 * calls; page_load is reset whenever a newer txg is seen.
 */
3353 static int
3354 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3355 {
3356 #ifdef _KERNEL
3357 uint64_t available_memory = ptob(freemem);
3358 static uint64_t page_load = 0;
3359 static uint64_t last_txg = 0;
3360
3361 #if defined(__i386)
3362 available_memory =
3363 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3364 #endif
/* Plenty of free memory: no throttling needed. */
3365 if (available_memory >= zfs_write_limit_max)
3366 return (0);
3367
3368 if (txg > last_txg) {
3369 last_txg = txg;
3370 page_load = 0;
3371 }
3372 /*
3373 * If we are in pageout, we know that memory is already tight,
3374 * the arc is already going to be evicting, so we just want to
3375 * continue to let page writes occur as quickly as possible.
3376 */
3377 if (curproc == proc_pageout) {
3378 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3379 return (ERESTART);
3380 /* Note: reserve is inflated, so we deflate */
3381 page_load += reserve / 8;
3382 return (0);
3383 } else if (page_load > 0 && arc_reclaim_needed()) {
3384 /* memory is low, delay before restarting */
3385 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3386 return (EAGAIN);
3387 }
3388 page_load = 0;
3389
/*
 * Count evictable MRU/MFU contents as available, but no more than the
 * amount by which the ARC currently exceeds its minimum size.
 */
3390 if (arc_size > arc_c_min) {
3391 uint64_t evictable_memory =
3392 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3393 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3394 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3395 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3396 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3397 }
3398
/* Throttle once in-flight data exceeds a quarter of what is available. */
3399 if (inflight_data > available_memory / 4) {
3400 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3401 return (ERESTART);
3402 }
3403 #endif
3404 return (0);
3405 }
3406
3407 void
3408 arc_tempreserve_clear(uint64_t reserve)
3409 {
3410 atomic_add_64(&arc_tempreserve, -reserve);
3411 ASSERT((int64_t)arc_tempreserve >= 0);
3412 }
3413
3414 int
3415 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3416 {
3417 int error;
3418 uint64_t anon_size;
3419
3420 #ifdef ZFS_DEBUG
3421 /*
3422 * Once in a while, fail for no reason. Everything should cope.
3423 */
3424 if (spa_get_random(10000) == 0) {
3425 dprintf("forcing random failure\n");
3426 return (ERESTART);
3427 }
3428 #endif
3429 if (reserve > arc_c/4 && !arc_no_grow)
3430 arc_c = MIN(arc_c_max, reserve * 4);
3431 if (reserve > arc_c)
3432 return (ENOMEM);
3433
3434 /*
3435 * Don't count loaned bufs as in flight dirty data to prevent long
3436 * network delays from blocking transactions that are ready to be
3437 * assigned to a txg.
3438 */
3439 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3440
3441 /*
3442 * Writes will, almost always, require additional memory allocations
3443 * in order to compress/encrypt/etc the data. We therefor need to
3444 * make sure that there is sufficient available memory for this.
3445 */
3446 if (error = arc_memory_throttle(reserve, anon_size, txg))
3447 return (error);
3448
3449 /*
3450 * Throttle writes when the amount of dirty data in the cache
3451 * gets too large. We try to keep the cache less than half full
3452 * of dirty blocks so that our sync times don't grow too large.
3453 * Note: if two requests come in concurrently, we might let them
3454 * both succeed, when one of them should fail. Not a huge deal.
3455 */
3456
3457 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3458 anon_size > arc_c / 4) {
3459 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3460 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3461 arc_tempreserve>>10,
3462 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3463 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3464 reserve>>10, arc_c>>10);
3465 return (ERESTART);
3466 }
3467 atomic_add_64(&arc_tempreserve, reserve);
3468 return (0);
3469 }
3470
/*
 * One-time ARC initialisation.
 *
 * Sizes the cache targets (arc_c, arc_c_min, arc_c_max, arc_p and
 * arc_meta_limit) from physical memory and the zfs_arc_* tunables, sets
 * up the per-state mutexes and lists, registers the "arcstats" kstat,
 * starts the reclaim thread and derives zfs_write_limit_max when it has
 * not been preset.
 */
3471 void
3472 arc_init(void)
3473 {
3474 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3475 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3476
3477 /* Convert seconds to clock ticks */
3478 arc_min_prefetch_lifespan = 1 * hz;
3479
3480 /* Start out with 1/8 of all memory */
3481 arc_c = physmem * PAGESIZE / 8;
3482
3483 #ifdef _KERNEL
3484 /*
3485 * On architectures where the physical memory can be larger
3486 * than the addressable space (intel in 32-bit mode), we may
3487 * need to limit the cache to 1/8 of VM size.
3488 */
3489 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3490 #endif
3491
/*
 * arc_c currently holds 1/8 of memory, so below arc_c / 4 is 1/32,
 * arc_c * 8 is all of it and arc_c * 6 is 3/4.
 */
3492 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3493 arc_c_min = MAX(arc_c / 4, 64<<20);
3494 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3495 if (arc_c * 8 >= 1<<30)
3496 arc_c_max = (arc_c * 8) - (1<<30);
3497 else
3498 arc_c_max = arc_c_min;
3499 arc_c_max = MAX(arc_c * 6, arc_c_max);
3500
3501 /*
3502 * Allow the tunables to override our calculations if they are
3503 * reasonable (ie. over 64MB)
3504 */
3505 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3506 arc_c_max = zfs_arc_max;
3507 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3508 arc_c_min = zfs_arc_min;
3509
/* Start with the target at its maximum and p at half of the target. */
3510 arc_c = arc_c_max;
3511 arc_p = (arc_c >> 1);
3512
3513 /* limit meta-data to 1/4 of the arc capacity */
3514 arc_meta_limit = arc_c_max / 4;
3515
3516 /* Allow the tunable to override if it is reasonable */
3517 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3518 arc_meta_limit = zfs_arc_meta_limit;
3519
/* Unless zfs_arc_min was set, keep the minimum at least meta_limit / 2. */
3520 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3521 arc_c_min = arc_meta_limit / 2;
3522
/* Optional tunables: only values greater than zero take effect. */
3523 if (zfs_arc_grow_retry > 0)
3524 arc_grow_retry = zfs_arc_grow_retry;
3525
3526 if (zfs_arc_shrink_shift > 0)
3527 arc_shrink_shift = zfs_arc_shrink_shift;
3528
3529 if (zfs_arc_p_min_shift > 0)
3530 arc_p_min_shift = zfs_arc_p_min_shift;
3531
3532 /* if kmem_flags are set, let's try to use less memory */
3533 if (kmem_debugging())
3534 arc_c = arc_c / 2;
3535 if (arc_c < arc_c_min)
3536 arc_c = arc_c_min;
3537
/* Bind the state pointers to their static state structures. */
3538 arc_anon = &ARC_anon;
3539 arc_mru = &ARC_mru;
3540 arc_mru_ghost = &ARC_mru_ghost;
3541 arc_mfu = &ARC_mfu;
3542 arc_mfu_ghost = &ARC_mfu_ghost;
3543 arc_l2c_only = &ARC_l2c_only;
3544 arc_size = 0;
3545
3546 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3547 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3548 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3549 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3550 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3551 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3552
/*
 * One metadata list and one data list per state.  No lists are created
 * for arc_anon here.
 */
3553 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3554 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3555 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3556 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3557 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3558 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3559 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3560 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3561 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3562 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3563 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3564 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3565 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3566 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3567 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3568 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3569 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3570 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3571 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3572 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3573
3574 buf_init();
3575
3576 arc_thread_exit = 0;
3577 arc_eviction_list = NULL;
3578 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3579 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3580
3581 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3582 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3583
/* kstat creation may fail; the ARC simply runs without published stats. */
3584 if (arc_ksp != NULL) {
3585 arc_ksp->ks_data = &arc_stats;
3586 kstat_install(arc_ksp);
3587 }
3588
3589 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3590 TS_RUN, minclsyspri);
3591
3592 arc_dead = FALSE;
3593 arc_warm = B_FALSE;
3594
/*
 * Derive the write limit from physical memory unless it was preset; a
 * preset limit zeroes zfs_write_limit_shift instead.
 */
3595 if (zfs_write_limit_max == 0)
3596 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3597 else
3598 zfs_write_limit_shift = 0;
3599 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3600 }
3601
3602 void
3603 arc_fini(void)
3604 {
3605 mutex_enter(&arc_reclaim_thr_lock);
3606 arc_thread_exit = 1;
3607 while (arc_thread_exit != 0)
3608 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3609 mutex_exit(&arc_reclaim_thr_lock);
3610
3611 arc_flush(NULL);
3612
3613 arc_dead = TRUE;
3614
3615 if (arc_ksp != NULL) {
3616 kstat_delete(arc_ksp);
3617 arc_ksp = NULL;
3618 }
3619
3620 mutex_destroy(&arc_eviction_mtx);
3621 mutex_destroy(&arc_reclaim_thr_lock);
3622 cv_destroy(&arc_reclaim_thr_cv);
3623
3624 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3625 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3626 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3627 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3628 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3629 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3630 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3631 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3632
3633 mutex_destroy(&arc_anon->arcs_mtx);
3634 mutex_destroy(&arc_mru->arcs_mtx);
3635 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3636 mutex_destroy(&arc_mfu->arcs_mtx);
3637 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3638 mutex_destroy(&arc_l2c_only->arcs_mtx);
3639
3640 mutex_destroy(&zfs_write_limit_lock);
3641
3642 buf_fini();
3643
3644 ASSERT(arc_loaned_bytes == 0);
3645 }
3646
3647 /*
3648 * Level 2 ARC
3649 *
3650 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3651 * It uses dedicated storage devices to hold cached data, which are populated
3652 * using large infrequent writes. The main role of this cache is to boost
3653 * the performance of random read workloads. The intended L2ARC devices
3654 * include short-stroked disks, solid state disks, and other media with
3655 * substantially faster read latency than disk.
3656 *
3657 * +-----------------------+
3658 * | ARC |
3659 * +-----------------------+
3660 * | ^ ^
3661 * | | |
3662 * l2arc_feed_thread() arc_read()
3663 * | | |
3664 * | l2arc read |
3665 * V | |
3666 * +---------------+ |
3667 * | L2ARC | |
3668 * +---------------+ |
3669 * | ^ |
3670 * l2arc_write() | |
3671 * | | |
3672 * V | |
3673 * +-------+ +-------+
3674 * | vdev | | vdev |
3675 * | cache | | cache |
3676 * +-------+ +-------+
3677 * +=========+ .-----.
3678 * : L2ARC : |-_____-|
3679 * : devices : | Disks |
3680 * +=========+ `-_____-'
3681 *
3682 * Read requests are satisfied from the following sources, in order:
3683 *
3684 * 1) ARC
3685 * 2) vdev cache of L2ARC devices
3686 * 3) L2ARC devices
3687 * 4) vdev cache of disks
3688 * 5) disks
3689 *
3690 * Some L2ARC device types exhibit extremely slow write performance.
3691 * To accommodate this, there are some significant differences between
3692 * the L2ARC and traditional cache design:
3693 *
3694 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
3695 * the ARC behave as usual, freeing buffers and placing headers on ghost
3696 * lists. The ARC does not send buffers to the L2ARC during eviction as
3697 * this would add inflated write latencies for all ARC memory pressure.
3698 *
3699 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3700 * It does this by periodically scanning buffers from the eviction-end of
3701 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3702 * not already there. It scans until a headroom of buffers is satisfied,
3703 * which itself is a buffer for ARC eviction. The thread that does this is
3704 * l2arc_feed_thread(), illustrated below; example sizes are included to
3705 * provide a better sense of ratio than this diagram:
3706 *
3707 * head --> tail
3708 * +---------------------+----------+
3709 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
3710 * +---------------------+----------+ | o L2ARC eligible
3711 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
3712 * +---------------------+----------+ |
3713 * 15.9 Gbytes ^ 32 Mbytes |
3714 * headroom |
3715 * l2arc_feed_thread()
3716 * |
3717 * l2arc write hand <--[oooo]--'
3718 * | 8 Mbyte
3719 * | write max
3720 * V
3721 * +==============================+
3722 * L2ARC dev |####|#|###|###| |####| ... |
3723 * +==============================+
3724 * 32 Gbytes
3725 *
3726 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3727 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3728 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
3729 * safe to say that this is an uncommon case, since buffers at the end of
3730 * the ARC lists have moved there due to inactivity.
3731 *
3732 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3733 * then the L2ARC simply misses copying some buffers. This serves as a
3734 * pressure valve to prevent heavy read workloads from both stalling the ARC
3735 * with waits and clogging the L2ARC with writes. This also helps prevent
3736 * the potential for the L2ARC to churn if it attempts to cache content too
3737 * quickly, such as during backups of the entire pool.
3738 *
3739 * 5. After system boot and before the ARC has filled main memory, there are
3740 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3741 * lists can remain mostly static. Instead of searching from tail of these
3742 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3743 * for eligible buffers, greatly increasing its chance of finding them.
3744 *
3745 * The L2ARC device write speed is also boosted during this time so that
3746 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
3747 * there are no L2ARC reads, and no fear of degrading read performance
3748 * through increased writes.
3749 *
3750 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3751 * the vdev queue can aggregate them into larger and fewer writes. Each
3752 * device is written to in a rotor fashion, sweeping writes through
3753 * available space then repeating.
3754 *
3755 * 7. The L2ARC does not store dirty content. It never needs to flush
3756 * write buffers back to disk based storage.
3757 *
3758 * 8. If an ARC buffer is written (and dirtied) which also exists in the
3759 * L2ARC, the now stale L2ARC buffer is immediately dropped.
3760 *
3761 * The performance of the L2ARC can be tweaked by a number of tunables, which
3762 * may be necessary for different workloads:
3763 *
3764 * l2arc_write_max max write bytes per interval
3765 * l2arc_write_boost extra write bytes during device warmup
3766 * l2arc_noprefetch skip caching prefetched buffers
3767 * l2arc_headroom number of max device writes to precache
3768 * l2arc_feed_secs seconds between L2ARC writing
3769 *
3770 * Tunables may be removed or added as future performance improvements are
3771 * integrated, and also may become zpool properties.
3772 *
3773 * There are three key functions that control how the L2ARC warms up:
3774 *
3775 * l2arc_write_eligible() check if a buffer is eligible to cache
3776 * l2arc_write_size() calculate how much to write
3777 * l2arc_write_interval() calculate sleep delay between writes
3778 *
3779 * These three functions determine what to write, how much, and how quickly
3780 * to send writes.
3781 */
3782
3783 static boolean_t
3784 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3785 {
3786 /*
3787 * A buffer is *not* eligible for the L2ARC if it:
3788 * 1. belongs to a different spa.
3789 * 2. has no attached buffer.
3790 * 3. is already cached on the L2ARC.
3791 * 4. has an I/O in progress (it may be an incomplete read).
3792 * 5. is flagged not eligible (zfs property).
3793 */
3794 if (ab->b_spa != spa_guid || ab->b_buf == NULL || ab->b_l2hdr != NULL ||
3795 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3796 return (B_FALSE);
3797
3798 return (B_TRUE);
3799 }
3800
3801 static uint64_t
3802 l2arc_write_size(l2arc_dev_t *dev)
3803 {
3804 uint64_t size;
3805
3806 size = dev->l2ad_write;
3807
3808 if (arc_warm == B_FALSE)
3809 size += dev->l2ad_boost;
3810
3811 return (size);
3812
3813 }
3814
3815 static clock_t
3816 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
3817 {
3818 clock_t interval, next;
3819
3820 /*
3821 * If the ARC lists are busy, increase our write rate; if the
3822 * lists are stale, idle back. This is achieved by checking
3823 * how much we previously wrote - if it was more than half of
3824 * what we wanted, schedule the next write much sooner.
3825 */
3826 if (l2arc_feed_again && wrote > (wanted / 2))
3827 interval = (hz * l2arc_feed_min_ms) / 1000;
3828 else
3829 interval = hz * l2arc_feed_secs;
3830
3831 next = MAX(lbolt, MIN(lbolt + interval, began + interval));
3832
3833 return (next);
3834 }
3835
/*
 * Header accounting: add HDR_SIZE + L2HDR_SIZE to the arcstat_l2_hdr_size
 * statistic and subtract HDR_SIZE from arcstat_hdr_size.
 * l2arc_hdr_stat_remove() applies the opposite adjustment.
 */
3836 static void
3837 l2arc_hdr_stat_add(void)
3838 {
3839 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3840 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3841 }
3842
/*
 * Header accounting: subtract HDR_SIZE + L2HDR_SIZE from the
 * arcstat_l2_hdr_size statistic and add HDR_SIZE back to
 * arcstat_hdr_size.  Inverse of l2arc_hdr_stat_add().
 */
3843 static void
3844 l2arc_hdr_stat_remove(void)
3845 {
3846 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3847 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3848 }
3849
3850 /*
3851 * Cycle through L2ARC devices. This is how L2ARC load balances.
3852 * If a device is returned, this also returns holding the spa config lock.
 *
 * The search resumes after l2arc_dev_last, wraps around at the end of
 * l2arc_dev_list, and stops at the first device whose vdev is not dead
 * or once it is back at the first device it examined.  Returns NULL
 * when there are no devices or none is usable.
3853 */
3854 static l2arc_dev_t *
3855 l2arc_dev_get_next(void)
3856 {
3857 l2arc_dev_t *first, *next = NULL;
3858
3859 /*
3860 * Lock out the removal of spas (spa_namespace_lock), then removal
3861 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
3862 * both locks will be dropped and a spa config lock held instead.
3863 */
3864 mutex_enter(&spa_namespace_lock);
3865 mutex_enter(&l2arc_dev_mtx);
3866
3867 /* if there are no vdevs, there is nothing to do */
3868 if (l2arc_ndev == 0)
3869 goto out;
3870
/* 'first' records the first device examined, to detect a full lap. */
3871 first = NULL;
3872 next = l2arc_dev_last;
3873 do {
3874 /* loop around the list looking for a non-faulted vdev */
3875 if (next == NULL) {
3876 next = list_head(l2arc_dev_list);
3877 } else {
3878 next = list_next(l2arc_dev_list, next);
3879 if (next == NULL)
3880 next = list_head(l2arc_dev_list);
3881 }
3882
3883 /* if we have come back to the start, bail out */
3884 if (first == NULL)
3885 first = next;
3886 else if (next == first)
3887 break;
3888
3889 } while (vdev_is_dead(next->l2ad_vdev));
3890
3891 /* if we were unable to find any usable vdevs, return NULL */
3892 if (vdev_is_dead(next->l2ad_vdev))
3893 next = NULL;
3894
/* Remember where to resume on the next call (NULL restarts at the head). */
3895 l2arc_dev_last = next;
3896
3897 out:
3898 mutex_exit(&l2arc_dev_mtx);
3899
3900 /*
3901 * Grab the config lock to prevent the 'next' device from being
3902 * removed while we are writing to it.
3903 */
3904 if (next != NULL)
3905 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3906 mutex_exit(&spa_namespace_lock);
3907
3908 return (next);
3909 }
3910
3911 /*
3912 * Free buffers that were tagged for destruction.
3913 */
3914 static void
3915 l2arc_do_free_on_write()
3916 {
3917 list_t *buflist;
3918 l2arc_data_free_t *df, *df_prev;
3919
3920 mutex_enter(&l2arc_free_on_write_mtx);
3921 buflist = l2arc_free_on_write;
3922
3923 for (df = list_tail(buflist); df; df = df_prev) {
3924 df_prev = list_prev(buflist, df);
3925 ASSERT(df->l2df_data != NULL);
3926 ASSERT(df->l2df_func != NULL);
3927 df->l2df_func(df->l2df_data, df->l2df_size);
3928 list_remove(buflist, df);
3929 kmem_free(df, sizeof (l2arc_data_free_t));
3930 }
3931
3932 mutex_exit(&l2arc_free_on_write_mtx);
3933 }
3934
3935 /*
3936 * A write to a cache device has completed. Update all headers to allow
3937 * reads from these buffers to begin.
 *
 * With l2arc_buflist_mtx held, the device's buffer list is walked
 * backwards starting from the entry before the write-head marker
 * (cb->l2wcb_head).  Each header whose hash lock can be taken without
 * blocking has ARC_L2_WRITING cleared; on a write error its L2ARC entry
 * is dropped as well.  The marker is then removed from the list and
 * returned to hdr_cache, pending free-on-write buffers are released and
 * the callback structure is freed.
3938 */
3939 static void
3940 l2arc_write_done(zio_t *zio)
3941 {
3942 l2arc_write_callback_t *cb;
3943 l2arc_dev_t *dev;
3944 list_t *buflist;
3945 arc_buf_hdr_t *head, *ab, *ab_prev;
3946 l2arc_buf_hdr_t *abl2;
3947 kmutex_t *hash_lock;
3948
3949 cb = zio->io_private;
3950 ASSERT(cb != NULL);
3951 dev = cb->l2wcb_dev;
3952 ASSERT(dev != NULL);
3953 head = cb->l2wcb_head;
3954 ASSERT(head != NULL);
3955 buflist = dev->l2ad_buflist;
3956 ASSERT(buflist != NULL);
3957 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
3958 l2arc_write_callback_t *, cb);
3959
3960 if (zio->io_error != 0)
3961 ARCSTAT_BUMP(arcstat_l2_writes_error);
3962
3963 mutex_enter(&l2arc_buflist_mtx);
3964
3965 /*
3966 * All writes completed, or an error was hit.
3967 */
3968 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
/* Fetch the predecessor first: ab may be unlinked from the list below. */
3969 ab_prev = list_prev(buflist, ab);
3970
3971 hash_lock = HDR_LOCK(ab);
3972 if (!mutex_tryenter(hash_lock)) {
3973 /*
3974 * This buffer misses out. It may be in a stage
3975 * of eviction. Its ARC_L2_WRITING flag will be
3976 * left set, denying reads to this buffer.
3977 */
3978 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
3979 continue;
3980 }
3981
3982 if (zio->io_error != 0) {
3983 /*
3984 * Error - drop L2ARC entry.
3985 */
3986 list_remove(buflist, ab);
3987 abl2 = ab->b_l2hdr;
3988 ab->b_l2hdr = NULL;
3989 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
3990 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
3991 }
3992
3993 /*
3994 * Allow ARC to begin reads to this L2ARC entry.
3995 */
3996 ab->b_flags &= ~ARC_L2_WRITING;
3997
3998 mutex_exit(hash_lock);
3999 }
4000
4001 atomic_inc_64(&l2arc_writes_done);
/* The write-head marker is no longer needed. */
4002 list_remove(buflist, head);
4003 kmem_cache_free(hdr_cache, head);
4004 mutex_exit(&l2arc_buflist_mtx);
4005
4006 l2arc_do_free_on_write();
4007
4008 kmem_free(cb, sizeof (l2arc_write_callback_t));
4009 }
4010
4011 /*
4012 * A read to a cache device completed. Validate buffer contents before
4013 * handing over to the regular ARC routines.
 *
 * The SCL_L2ARC config lock is released on entry.  If the checksum
 * matches, the zio reports no error and the header has not been evicted
 * from the L2ARC, the zio is handed to arc_read_done().  Otherwise the
 * failure is counted and, when nobody is waiting on the zio, the read is
 * reissued asynchronously to the primary storage.  The callback
 * structure is freed in either case.
4014 */
4015 static void
4016 l2arc_read_done(zio_t *zio)
4017 {
4018 l2arc_read_callback_t *cb;
4019 arc_buf_hdr_t *hdr;
4020 arc_buf_t *buf;
4021 kmutex_t *hash_lock;
4022 int equal;
4023
4024 ASSERT(zio->io_vd != NULL);
4025 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4026
4027 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4028
4029 cb = zio->io_private;
4030 ASSERT(cb != NULL);
4031 buf = cb->l2rcb_buf;
4032 ASSERT(buf != NULL);
4033 hdr = buf->b_hdr;
4034 ASSERT(hdr != NULL);
4035
4036 hash_lock = HDR_LOCK(hdr);
4037 mutex_enter(hash_lock);
4038
4039 /*
4040 * Check this survived the L2ARC journey.
4041 */
4042 equal = arc_cksum_equal(buf);
4043 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4044 mutex_exit(hash_lock);
/*
 * Point the zio at the buffer and at a private copy of the block
 * pointer before passing it to arc_read_done().
 */
4045 zio->io_private = buf;
4046 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4047 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4048 arc_read_done(zio);
4049 } else {
4050 mutex_exit(hash_lock);
4051 /*
4052 * Buffer didn't survive caching. Increment stats and
4053 * reissue to the original storage device.
4054 */
4055 if (zio->io_error != 0) {
4056 ARCSTAT_BUMP(arcstat_l2_io_error);
4057 } else {
/* No I/O error was reported, so flag the failure explicitly. */
4058 zio->io_error = EIO;
4059 }
4060 if (!equal)
4061 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4062
4063 /*
4064 * If there's no waiter, issue an async i/o to the primary
4065 * storage now. If there *is* a waiter, the caller must
4066 * issue the i/o in a context where it's OK to block.
4067 */
4068 if (zio->io_waiter == NULL) {
4069 zio_t *pio = zio_unique_parent(zio);
4070
4071 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4072
4073 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4074 buf->b_data, zio->io_size, arc_read_done, buf,
4075 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4076 }
4077 }
4078
4079 kmem_free(cb, sizeof (l2arc_read_callback_t));
4080 }
4081
4082 /*
4083 * This is the list priority from which the L2ARC will search for pages to
4084 * cache. This is used within loops (0..3) to cycle through lists in the
4085 * desired order. This order can have a significant effect on cache
4086 * performance.
4087 *
4088 * Currently the metadata lists are hit first, MFU then MRU, followed by
4089 * the data lists. This function returns a locked list, and also returns
4090 * the lock pointer.
4091 */
4092 static list_t *
4093 l2arc_list_locked(int list_num, kmutex_t **lock)
4094 {
4095 list_t *list;
4096
4097 ASSERT(list_num >= 0 && list_num <= 3);
4098
4099 switch (list_num) {
4100 case 0:
4101 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4102 *lock = &arc_mfu->arcs_mtx;
4103 break;
4104 case 1:
4105 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4106 *lock = &arc_mru->arcs_mtx;
4107 break;
4108 case 2:
4109 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4110 *lock = &arc_mfu->arcs_mtx;
4111 break;
4112 case 3:
4113 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4114 *lock = &arc_mru->arcs_mtx;
4115 break;
4116 }
4117
4118 ASSERT(!(MUTEX_HELD(*lock)));
4119 mutex_enter(*lock);
4120 return (list);
4121 }
4122
4123 /*
4124 * Evict buffers from the device write hand to the distance specified in
4125 * bytes. This distance may span populated buffers, it may span nothing.
4126 * This is clearing a region on the L2ARC device ready for writing.
4127 * If the 'all' boolean is set, every buffer is evicted.
4128 */
4129 static void
4130 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4131 {
4132 list_t *buflist;
4133 l2arc_buf_hdr_t *abl2;
4134 arc_buf_hdr_t *ab, *ab_prev;
4135 kmutex_t *hash_lock;
4136 uint64_t taddr;
4137
4138 buflist = dev->l2ad_buflist;
4139
4140 if (buflist == NULL)
4141 return;
4142
4143 if (!all && dev->l2ad_first) {
4144 /*
4145 * This is the first sweep through the device. There is
4146 * nothing to evict.
4147 */
4148 return;
4149 }
4150
4151 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4152 /*
4153 * When nearing the end of the device, evict to the end
4154 * before the device write hand jumps to the start.
4155 */
4156 taddr = dev->l2ad_end;
4157 } else {
4158 taddr = dev->l2ad_hand + distance;
4159 }
4160 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4161 uint64_t, taddr, boolean_t, all);
4162
4163 top:
4164 mutex_enter(&l2arc_buflist_mtx);
4165 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4166 ab_prev = list_prev(buflist, ab);
4167
4168 hash_lock = HDR_LOCK(ab);
4169 if (!mutex_tryenter(hash_lock)) {
4170 /*
4171 * Missed the hash lock. Retry.
4172 */
4173 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4174 mutex_exit(&l2arc_buflist_mtx);
4175 mutex_enter(hash_lock);
4176 mutex_exit(hash_lock);
4177 goto top;
4178 }
4179
4180 if (HDR_L2_WRITE_HEAD(ab)) {
4181 /*
4182 * We hit a write head node. Leave it for
4183 * l2arc_write_done().
4184 */
4185 list_remove(buflist, ab);
4186 mutex_exit(hash_lock);
4187 continue;
4188 }
4189
4190 if (!all && ab->b_l2hdr != NULL &&
4191 (ab->b_l2hdr->b_daddr > taddr ||
4192 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4193 /*
4194 * We've evicted to the target address,
4195 * or the end of the device.
4196 */
4197 mutex_exit(hash_lock);
4198 break;
4199 }
4200
4201 if (HDR_FREE_IN_PROGRESS(ab)) {
4202 /*
4203 * Already on the path to destruction.
4204 */
4205 mutex_exit(hash_lock);
4206 continue;
4207 }
4208
4209 if (ab->b_state == arc_l2c_only) {
4210 ASSERT(!HDR_L2_READING(ab));
4211 /*
4212 * This doesn't exist in the ARC. Destroy.
4213 * arc_hdr_destroy() will call list_remove()
4214 * and decrement arcstat_l2_size.
4215 */
4216 arc_change_state(arc_anon, ab, hash_lock);
4217 arc_hdr_destroy(ab);
4218 } else {
4219 /*
4220 * Invalidate issued or about to be issued
4221 * reads, since we may be about to write
4222 * over this location.
4223 */
4224 if (HDR_L2_READING(ab)) {
4225 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4226 ab->b_flags |= ARC_L2_EVICTED;
4227 }
4228
4229 /*
4230 * Tell ARC this no longer exists in L2ARC.
4231 */
4232 if (ab->b_l2hdr != NULL) {
4233 abl2 = ab->b_l2hdr;
4234 ab->b_l2hdr = NULL;
4235 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4236 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4237 }
4238 list_remove(buflist, ab);
4239
4240 /*
4241 * This may have been leftover after a
4242 * failed write.
4243 */
4244 ab->b_flags &= ~ARC_L2_WRITING;
4245 }
4246 mutex_exit(hash_lock);
4247 }
4248 mutex_exit(&l2arc_buflist_mtx);
4249
4250 spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
4251 dev->l2ad_evict = taddr;
4252 }
4253
4254 /*
4255 * Find and write ARC buffers to the L2ARC device.
4256 *
4257 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4258 * for reading until they have completed writing.
4259 */
4260 static uint64_t
4261 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4262 {
4263 arc_buf_hdr_t *ab, *ab_prev, *head;
4264 l2arc_buf_hdr_t *hdrl2;
4265 list_t *list;
4266 uint64_t passed_sz, write_sz, buf_sz, headroom;
4267 void *buf_data;
4268 kmutex_t *hash_lock, *list_lock;
4269 boolean_t have_lock, full;
4270 l2arc_write_callback_t *cb;
4271 zio_t *pio, *wzio;
4272 uint64_t guid = spa_guid(spa);
4273
4274 ASSERT(dev->l2ad_vdev != NULL);
4275
4276 pio = NULL;
4277 write_sz = 0;
4278 full = B_FALSE;
4279 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4280 head->b_flags |= ARC_L2_WRITE_HEAD;
4281
4282 /*
4283 * Copy buffers for L2ARC writing.
4284 */
4285 mutex_enter(&l2arc_buflist_mtx);
4286 for (int try = 0; try <= 3; try++) {
4287 list = l2arc_list_locked(try, &list_lock);
4288 passed_sz = 0;
4289
4290 /*
4291 * L2ARC fast warmup.
4292 *
4293 * Until the ARC is warm and starts to evict, read from the
4294 * head of the ARC lists rather than the tail.
4295 */
4296 headroom = target_sz * l2arc_headroom;
4297 if (arc_warm == B_FALSE)
4298 ab = list_head(list);
4299 else
4300 ab = list_tail(list);
4301
4302 for (; ab; ab = ab_prev) {
4303 if (arc_warm == B_FALSE)
4304 ab_prev = list_next(list, ab);
4305 else
4306 ab_prev = list_prev(list, ab);
4307
4308 hash_lock = HDR_LOCK(ab);
4309 have_lock = MUTEX_HELD(hash_lock);
4310 if (!have_lock && !mutex_tryenter(hash_lock)) {
4311 /*
4312 * Skip this buffer rather than waiting.
4313 */
4314 continue;
4315 }
4316
4317 passed_sz += ab->b_size;
4318 if (passed_sz > headroom) {
4319 /*
4320 * Searched too far.
4321 */
4322 mutex_exit(hash_lock);
4323 break;
4324 }
4325
4326 if (!l2arc_write_eligible(guid, ab)) {
4327 mutex_exit(hash_lock);
4328 continue;
4329 }
4330
4331 if ((write_sz + ab->b_size) > target_sz) {
4332 full = B_TRUE;
4333 mutex_exit(hash_lock);
4334 break;
4335 }
4336
4337 if (pio == NULL) {
4338 /*
4339 * Insert a dummy header on the buflist so
4340 * l2arc_write_done() can find where the
4341 * write buffers begin without searching.
4342 */
4343 list_insert_head(dev->l2ad_buflist, head);
4344
4345 cb = kmem_alloc(
4346 sizeof (l2arc_write_callback_t), KM_SLEEP);
4347 cb->l2wcb_dev = dev;
4348 cb->l2wcb_head = head;
4349 pio = zio_root(spa, l2arc_write_done, cb,
4350 ZIO_FLAG_CANFAIL);
4351 }
4352
4353 /*
4354 * Create and add a new L2ARC header.
4355 */
4356 hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4357 hdrl2->b_dev = dev;
4358 hdrl2->b_daddr = dev->l2ad_hand;
4359
4360 ab->b_flags |= ARC_L2_WRITING;
4361 ab->b_l2hdr = hdrl2;
4362 list_insert_head(dev->l2ad_buflist, ab);
4363 buf_data = ab->b_buf->b_data;
4364 buf_sz = ab->b_size;
4365
4366 /*
4367 * Compute and store the buffer cksum before
4368 * writing. On debug the cksum is verified first.
4369 */
4370 arc_cksum_verify(ab->b_buf);
4371 arc_cksum_compute(ab->b_buf, B_TRUE);
4372
4373 mutex_exit(hash_lock);
4374
4375 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4376 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4377 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4378 ZIO_FLAG_CANFAIL, B_FALSE);
4379
4380 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4381 zio_t *, wzio);
4382 (void) zio_nowait(wzio);
4383
4384 /*
4385 * Keep the clock hand suitably device-aligned.
4386 */
4387 buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4388
4389 write_sz += buf_sz;
4390 dev->l2ad_hand += buf_sz;
4391 }
4392
4393 mutex_exit(list_lock);
4394
4395 if (full == B_TRUE)
4396 break;
4397 }
4398 mutex_exit(&l2arc_buflist_mtx);
4399
4400 if (pio == NULL) {
4401 ASSERT3U(write_sz, ==, 0);
4402 kmem_cache_free(hdr_cache, head);
4403 return (0);
4404 }
4405
4406 ASSERT3U(write_sz, <=, target_sz);
4407 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4408 ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4409 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4410 spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
4411
4412 /*
4413 * Bump device hand to the device start if it is approaching the end.
4414 * l2arc_evict() will already have evicted ahead for this case.
4415 */
4416 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4417 spa_l2cache_space_update(dev->l2ad_vdev, 0,
4418 dev->l2ad_end - dev->l2ad_hand);
4419 dev->l2ad_hand = dev->l2ad_start;
4420 dev->l2ad_evict = dev->l2ad_start;
4421 dev->l2ad_first = B_FALSE;
4422 }
4423
4424 dev->l2ad_writing = B_TRUE;
4425 (void) zio_wait(pio);
4426 dev->l2ad_writing = B_FALSE;
4427
4428 return (write_sz);
4429 }
4430
4431 /*
4432 * This thread feeds the L2ARC at regular intervals. This is the beating
4433 * heart of the L2ARC.
4434 */
4435 static void
4436 l2arc_feed_thread(void)
4437 {
4438 callb_cpr_t cpr;
4439 l2arc_dev_t *dev;
4440 spa_t *spa;
4441 uint64_t size, wrote;
4442 clock_t begin, next = lbolt;
4443
4444 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4445
4446 mutex_enter(&l2arc_feed_thr_lock);
4447
4448 while (l2arc_thread_exit == 0) {
4449 CALLB_CPR_SAFE_BEGIN(&cpr);
4450 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4451 next);
4452 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4453 next = lbolt + hz;
4454
4455 /*
4456 * Quick check for L2ARC devices.
4457 */
4458 mutex_enter(&l2arc_dev_mtx);
4459 if (l2arc_ndev == 0) {
4460 mutex_exit(&l2arc_dev_mtx);
4461 continue;
4462 }
4463 mutex_exit(&l2arc_dev_mtx);
4464 begin = lbolt;
4465
4466 /*
4467 * This selects the next l2arc device to write to, and in
4468 * doing so the next spa to feed from: dev->l2ad_spa. This
4469 * will return NULL if there are now no l2arc devices or if
4470 * they are all faulted.
4471 *
4472 * If a device is returned, its spa's config lock is also
4473 * held to prevent device removal. l2arc_dev_get_next()
4474 * will grab and release l2arc_dev_mtx.
4475 */
4476 if ((dev = l2arc_dev_get_next()) == NULL)
4477 continue;
4478
4479 spa = dev->l2ad_spa;
4480 ASSERT(spa != NULL);
4481
4482 /*
4483 * Avoid contributing to memory pressure.
4484 */
4485 if (arc_reclaim_needed()) {
4486 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4487 spa_config_exit(spa, SCL_L2ARC, dev);
4488 continue;
4489 }
4490
4491 ARCSTAT_BUMP(arcstat_l2_feeds);
4492
4493 size = l2arc_write_size(dev);
4494
4495 /*
4496 * Evict L2ARC buffers that will be overwritten.
4497 */
4498 l2arc_evict(dev, size, B_FALSE);
4499
4500 /*
4501 * Write ARC buffers.
4502 */
4503 wrote = l2arc_write_buffers(spa, dev, size);
4504
4505 /*
4506 * Calculate interval between writes.
4507 */
4508 next = l2arc_write_interval(begin, size, wrote);
4509 spa_config_exit(spa, SCL_L2ARC, dev);
4510 }
4511
4512 l2arc_thread_exit = 0;
4513 cv_broadcast(&l2arc_feed_thr_cv);
4514 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
4515 thread_exit();
4516 }
4517
4518 boolean_t
4519 l2arc_vdev_present(vdev_t *vd)
4520 {
4521 l2arc_dev_t *dev;
4522
4523 mutex_enter(&l2arc_dev_mtx);
4524 for (dev = list_head(l2arc_dev_list); dev != NULL;
4525 dev = list_next(l2arc_dev_list, dev)) {
4526 if (dev->l2ad_vdev == vd)
4527 break;
4528 }
4529 mutex_exit(&l2arc_dev_mtx);
4530
4531 return (dev != NULL);
4532 }
4533
4534 /*
4535 * Add a vdev for use by the L2ARC. By this point the spa has already
4536 * validated the vdev and opened it.
4537 */
4538 void
4539 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4540 {
4541 l2arc_dev_t *adddev;
4542
4543 ASSERT(!l2arc_vdev_present(vd));
4544
4545 /*
4546 * Create a new l2arc device entry.
4547 */
4548 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4549 adddev->l2ad_spa = spa;
4550 adddev->l2ad_vdev = vd;
4551 adddev->l2ad_write = l2arc_write_max;
4552 adddev->l2ad_boost = l2arc_write_boost;
4553 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4554 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4555 adddev->l2ad_hand = adddev->l2ad_start;
4556 adddev->l2ad_evict = adddev->l2ad_start;
4557 adddev->l2ad_first = B_TRUE;
4558 adddev->l2ad_writing = B_FALSE;
4559 ASSERT3U(adddev->l2ad_write, >, 0);
4560
4561 /*
4562 * This is a list of all ARC buffers that are still valid on the
4563 * device.
4564 */
4565 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4566 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4567 offsetof(arc_buf_hdr_t, b_l2node));
4568
4569 spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
4570
4571 /*
4572 * Add device to global list
4573 */
4574 mutex_enter(&l2arc_dev_mtx);
4575 list_insert_head(l2arc_dev_list, adddev);
4576 atomic_inc_64(&l2arc_ndev);
4577 mutex_exit(&l2arc_dev_mtx);
4578 }
4579
4580 /*
4581 * Remove a vdev from the L2ARC.
4582 */
4583 void
4584 l2arc_remove_vdev(vdev_t *vd)
4585 {
4586 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4587
4588 /*
4589 * Find the device by vdev
4590 */
4591 mutex_enter(&l2arc_dev_mtx);
4592 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4593 nextdev = list_next(l2arc_dev_list, dev);
4594 if (vd == dev->l2ad_vdev) {
4595 remdev = dev;
4596 break;
4597 }
4598 }
4599 ASSERT(remdev != NULL);
4600
4601 /*
4602 * Remove device from global list
4603 */
4604 list_remove(l2arc_dev_list, remdev);
4605 l2arc_dev_last = NULL; /* may have been invalidated */
4606 atomic_dec_64(&l2arc_ndev);
4607 mutex_exit(&l2arc_dev_mtx);
4608
4609 /*
4610 * Clear all buflists and ARC references. L2ARC device flush.
4611 */
4612 l2arc_evict(remdev, 0, B_TRUE);
4613 list_destroy(remdev->l2ad_buflist);
4614 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4615 kmem_free(remdev, sizeof (l2arc_dev_t));
4616 }
4617
4618 void
4619 l2arc_init(void)
4620 {
4621 l2arc_thread_exit = 0;
4622 l2arc_ndev = 0;
4623 l2arc_writes_sent = 0;
4624 l2arc_writes_done = 0;
4625
4626 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4627 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4628 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4629 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4630 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4631
4632 l2arc_dev_list = &L2ARC_dev_list;
4633 l2arc_free_on_write = &L2ARC_free_on_write;
4634 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4635 offsetof(l2arc_dev_t, l2ad_node));
4636 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4637 offsetof(l2arc_data_free_t, l2df_list_node));
4638 }
4639
4640 void
4641 l2arc_fini(void)
4642 {
4643 /*
4644 * This is called from dmu_fini(), which is called from spa_fini();
4645 * Because of this, we can assume that all l2arc devices have
4646 * already been removed when the pools themselves were removed.
4647 */
4648
4649 l2arc_do_free_on_write();
4650
4651 mutex_destroy(&l2arc_feed_thr_lock);
4652 cv_destroy(&l2arc_feed_thr_cv);
4653 mutex_destroy(&l2arc_dev_mtx);
4654 mutex_destroy(&l2arc_buflist_mtx);
4655 mutex_destroy(&l2arc_free_on_write_mtx);
4656
4657 list_destroy(l2arc_dev_list);
4658 list_destroy(l2arc_free_on_write);
4659 }
4660
4661 void
4662 l2arc_start(void)
4663 {
4664 if (!(spa_mode_global & FWRITE))
4665 return;
4666
4667 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4668 TS_RUN, minclsyspri);
4669 }
4670
4671 void
4672 l2arc_stop(void)
4673 {
4674 if (!(spa_mode_global & FWRITE))
4675 return;
4676
4677 mutex_enter(&l2arc_feed_thr_lock);
4678 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
4679 l2arc_thread_exit = 1;
4680 while (l2arc_thread_exit != 0)
4681 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4682 mutex_exit(&l2arc_feed_thr_lock);
4683 }