1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2011 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 */
27
28 /*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory. This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about. Our cache is not so simple. At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them. Blocks are only evictable
43 * when there are no external references active. This makes
44 * eviction far more problematic: we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space. In these circumstances we are unable to adjust the cache
49 * size. To prevent the cache growing unbounded at these times we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss. Our model has a variable sized cache. It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size. So
62 * when adjusting the cache size following a cache miss, it is simply
63 * a matter of choosing a single page to evict. In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes). We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
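/*
 * Illustrative sketch (not part of the original file): one way to picture
 * difference (3) above.  Because cache blocks vary in size, making room
 * for a miss means walking the eviction list and accumulating victims
 * until their combined size covers the request, rather than dropping a
 * single fixed-size page.  The helper below is a simplified stand-in for
 * the real logic in arc_evict() further down; it is compiled out.
 */
#if 0
static uint64_t
example_evict_approx(const uint64_t *sizes, int nblocks, uint64_t wanted)
{
	uint64_t freed = 0;
	int i;

	/* Walk from the "lowest" end of the list until enough is freed. */
	for (i = 0; i < nblocks && freed < wanted; i++)
		freed += sizes[i];

	return (freed);
}
#endif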
72
73 /*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists. The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2. We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes, rather they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table. It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state. When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use mutex_tryenter() to avoid deadlock. Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()). Note however that the data associated
104 * with the buffer may be evicted prior to the callback. The callback
105 * must be made with *no locks held* (to prevent deadlock). Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_buf_evict()
108 * and arc_do_user_evicts().
109 *
110 * It is also possible to register a callback which is run when the
111 * arc_meta_limit is reached and no buffers can be safely evicted. In
112 * this case the arc user should drop a reference on some arc buffers so
113 * they can be reclaimed and the arc_meta_limit honored. For example,
114 * when using the ZPL each dentry holds a reference on a znode. These
115 * dentries must be pruned before the arc buffer holding the znode can
116 * be safely evicted.
117 *
118 * Note that the majority of the performance stats are manipulated
119 * with atomic operations.
120 *
121 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
122 *
123 * - L2ARC buflist creation
124 * - L2ARC buflist eviction
125 * - L2ARC write completion, which walks L2ARC buflists
126 * - ARC header destruction, as it removes from L2ARC buflists
127 * - ARC header release, as it removes from L2ARC buflists
128 */
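/*
 * Illustrative sketch (not part of the original file): the lock ordering
 * rule described above.  While an arc state (list) mutex is held, a hash
 * table lock may only be taken with mutex_tryenter(); blocking with
 * mutex_enter() could deadlock against a thread that holds the hash lock
 * and is waiting for the list lock.  The helper and its names are
 * hypothetical and compiled out.
 */
#if 0
static boolean_t
example_try_hash_while_list_locked(kmutex_t *list_lock, kmutex_t *hash_lock)
{
	ASSERT(MUTEX_HELD(list_lock));

	if (!mutex_tryenter(hash_lock))
		return (B_FALSE);	/* caller must skip this buffer */

	/* ... operate on the header ... */
	mutex_exit(hash_lock);
	return (B_TRUE);
}
#endif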
129
130 #include <sys/spa.h>
131 #include <sys/zio.h>
132 #include <sys/zio_compress.h>
133 #include <sys/zfs_context.h>
134 #include <sys/arc.h>
135 #include <sys/vdev.h>
136 #include <sys/vdev_impl.h>
137 #ifdef _KERNEL
138 #include <sys/vmsystm.h>
139 #include <vm/anon.h>
140 #include <sys/fs/swapnode.h>
141 #include <sys/zpl.h>
142 #endif
143 #include <sys/callb.h>
144 #include <sys/kstat.h>
145 #include <sys/dmu_tx.h>
146 #include <zfs_fletcher.h>
147
148 static kmutex_t arc_reclaim_thr_lock;
149 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
150 static uint8_t arc_thread_exit;
151
152 /* number of bytes to prune from caches when arc_meta_limit is reached */
153 int zfs_arc_meta_prune = 1048576;
154
155 typedef enum arc_reclaim_strategy {
156 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
157 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
158 } arc_reclaim_strategy_t;
159
160 /* number of seconds before growing cache again */
161 int zfs_arc_grow_retry = 5;
162
163 /* shift of arc_c for calculating both min and max arc_p */
164 int zfs_arc_p_min_shift = 4;
165
166 /* log2(fraction of arc to reclaim) */
167 int zfs_arc_shrink_shift = 5;
168
169 /*
170 * minimum lifespan of a prefetch block in clock ticks
171 * (initialized in arc_init())
172 */
173 int zfs_arc_min_prefetch_lifespan = HZ;
174
175 /* disable proactive arc throttle due to low memory */
176 int zfs_arc_memory_throttle_disable = 1;
177
178 /* disable duplicate buffer eviction */
179 int zfs_disable_dup_eviction = 0;
180
181 static int arc_dead;
182
183 /* expiration time for arc_no_grow */
184 static clock_t arc_grow_time = 0;
185
186 /*
187 * The arc has filled available memory and has now warmed up.
188 */
189 static boolean_t arc_warm;
190
191 /*
192 * These tunables are for performance analysis.
193 */
194 unsigned long zfs_arc_max = 0;
195 unsigned long zfs_arc_min = 0;
196 unsigned long zfs_arc_meta_limit = 0;
197
198 /*
199 * Note that buffers can be in one of 6 states:
200 * ARC_anon - anonymous (discussed below)
201 * ARC_mru - recently used, currently cached
202 * ARC_mru_ghost - recently used, no longer in cache
203 * ARC_mfu - frequently used, currently cached
204 * ARC_mfu_ghost - frequently used, no longer in cache
205 * ARC_l2c_only - exists in L2ARC but not other states
206 * When there are no active references to the buffer, they are
207 * linked onto a list in one of these arc states. These are
208 * the only buffers that can be evicted or deleted. Within each
209 * state there are multiple lists, one for meta-data and one for
210 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
211 * etc.) is tracked separately so that it can be managed more
212 * explicitly: favored over data, limited explicitly.
213 *
214 * Anonymous buffers are buffers that are not associated with
215 * a DVA. These are buffers that hold dirty block copies
216 * before they are written to stable storage. By definition,
217 * they are "ref'd" and are considered part of arc_mru
218 * that cannot be freed. Generally, they will acquire a DVA
219 * as they are written and migrate onto the arc_mru list.
220 *
221 * The ARC_l2c_only state is for buffers that are in the second
222 * level ARC but no longer in any of the ARC_m* lists. The second
223 * level ARC itself may also contain buffers that are in any of
224 * the ARC_m* states - meaning that a buffer can exist in two
225 * places. The reason for the ARC_l2c_only state is to keep the
226 * buffer header in the hash table, so that reads that hit the
227 * second level ARC benefit from these fast lookups.
228 */
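/*
 * Illustrative walk-through (not part of the original file): a freshly
 * dirtied buffer starts out in ARC_anon; once it is written and acquires
 * a DVA it sits in ARC_mru.  If its data is later evicted, only the
 * header survives in ARC_mru_ghost, and a subsequent read that hits that
 * ghost entry brings the data back in and promotes the block to ARC_mfu.
 * A block that remains only on an L2ARC device, and in none of the
 * ARC_m* lists, is tracked as ARC_l2c_only.
 */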
229
230 typedef struct arc_state {
231 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
232 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
233 uint64_t arcs_size; /* total amount of data in this state */
234 kmutex_t arcs_mtx;
235 } arc_state_t;
236
237 /* The 6 states: */
238 static arc_state_t ARC_anon;
239 static arc_state_t ARC_mru;
240 static arc_state_t ARC_mru_ghost;
241 static arc_state_t ARC_mfu;
242 static arc_state_t ARC_mfu_ghost;
243 static arc_state_t ARC_l2c_only;
244
245 typedef struct arc_stats {
246 kstat_named_t arcstat_hits;
247 kstat_named_t arcstat_misses;
248 kstat_named_t arcstat_demand_data_hits;
249 kstat_named_t arcstat_demand_data_misses;
250 kstat_named_t arcstat_demand_metadata_hits;
251 kstat_named_t arcstat_demand_metadata_misses;
252 kstat_named_t arcstat_prefetch_data_hits;
253 kstat_named_t arcstat_prefetch_data_misses;
254 kstat_named_t arcstat_prefetch_metadata_hits;
255 kstat_named_t arcstat_prefetch_metadata_misses;
256 kstat_named_t arcstat_mru_hits;
257 kstat_named_t arcstat_mru_ghost_hits;
258 kstat_named_t arcstat_mfu_hits;
259 kstat_named_t arcstat_mfu_ghost_hits;
260 kstat_named_t arcstat_deleted;
261 kstat_named_t arcstat_recycle_miss;
262 kstat_named_t arcstat_mutex_miss;
263 kstat_named_t arcstat_evict_skip;
264 kstat_named_t arcstat_evict_l2_cached;
265 kstat_named_t arcstat_evict_l2_eligible;
266 kstat_named_t arcstat_evict_l2_ineligible;
267 kstat_named_t arcstat_hash_elements;
268 kstat_named_t arcstat_hash_elements_max;
269 kstat_named_t arcstat_hash_collisions;
270 kstat_named_t arcstat_hash_chains;
271 kstat_named_t arcstat_hash_chain_max;
272 kstat_named_t arcstat_p;
273 kstat_named_t arcstat_c;
274 kstat_named_t arcstat_c_min;
275 kstat_named_t arcstat_c_max;
276 kstat_named_t arcstat_size;
277 kstat_named_t arcstat_hdr_size;
278 kstat_named_t arcstat_data_size;
279 kstat_named_t arcstat_other_size;
280 kstat_named_t arcstat_anon_size;
281 kstat_named_t arcstat_anon_evict_data;
282 kstat_named_t arcstat_anon_evict_metadata;
283 kstat_named_t arcstat_mru_size;
284 kstat_named_t arcstat_mru_evict_data;
285 kstat_named_t arcstat_mru_evict_metadata;
286 kstat_named_t arcstat_mru_ghost_size;
287 kstat_named_t arcstat_mru_ghost_evict_data;
288 kstat_named_t arcstat_mru_ghost_evict_metadata;
289 kstat_named_t arcstat_mfu_size;
290 kstat_named_t arcstat_mfu_evict_data;
291 kstat_named_t arcstat_mfu_evict_metadata;
292 kstat_named_t arcstat_mfu_ghost_size;
293 kstat_named_t arcstat_mfu_ghost_evict_data;
294 kstat_named_t arcstat_mfu_ghost_evict_metadata;
295 kstat_named_t arcstat_l2_hits;
296 kstat_named_t arcstat_l2_misses;
297 kstat_named_t arcstat_l2_feeds;
298 kstat_named_t arcstat_l2_rw_clash;
299 kstat_named_t arcstat_l2_read_bytes;
300 kstat_named_t arcstat_l2_write_bytes;
301 kstat_named_t arcstat_l2_writes_sent;
302 kstat_named_t arcstat_l2_writes_done;
303 kstat_named_t arcstat_l2_writes_error;
304 kstat_named_t arcstat_l2_writes_hdr_miss;
305 kstat_named_t arcstat_l2_evict_lock_retry;
306 kstat_named_t arcstat_l2_evict_reading;
307 kstat_named_t arcstat_l2_free_on_write;
308 kstat_named_t arcstat_l2_abort_lowmem;
309 kstat_named_t arcstat_l2_cksum_bad;
310 kstat_named_t arcstat_l2_io_error;
311 kstat_named_t arcstat_l2_size;
312 kstat_named_t arcstat_l2_asize;
313 kstat_named_t arcstat_l2_hdr_size;
314 kstat_named_t arcstat_l2_compress_successes;
315 kstat_named_t arcstat_l2_compress_zeros;
316 kstat_named_t arcstat_l2_compress_failures;
317 kstat_named_t arcstat_memory_throttle_count;
318 kstat_named_t arcstat_duplicate_buffers;
319 kstat_named_t arcstat_duplicate_buffers_size;
320 kstat_named_t arcstat_duplicate_reads;
321 kstat_named_t arcstat_memory_direct_count;
322 kstat_named_t arcstat_memory_indirect_count;
323 kstat_named_t arcstat_no_grow;
324 kstat_named_t arcstat_tempreserve;
325 kstat_named_t arcstat_loaned_bytes;
326 kstat_named_t arcstat_prune;
327 kstat_named_t arcstat_meta_used;
328 kstat_named_t arcstat_meta_limit;
329 kstat_named_t arcstat_meta_max;
330 } arc_stats_t;
331
332 static arc_stats_t arc_stats = {
333 { "hits", KSTAT_DATA_UINT64 },
334 { "misses", KSTAT_DATA_UINT64 },
335 { "demand_data_hits", KSTAT_DATA_UINT64 },
336 { "demand_data_misses", KSTAT_DATA_UINT64 },
337 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
338 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
339 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
340 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
341 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
342 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
343 { "mru_hits", KSTAT_DATA_UINT64 },
344 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
345 { "mfu_hits", KSTAT_DATA_UINT64 },
346 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
347 { "deleted", KSTAT_DATA_UINT64 },
348 { "recycle_miss", KSTAT_DATA_UINT64 },
349 { "mutex_miss", KSTAT_DATA_UINT64 },
350 { "evict_skip", KSTAT_DATA_UINT64 },
351 { "evict_l2_cached", KSTAT_DATA_UINT64 },
352 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
353 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
354 { "hash_elements", KSTAT_DATA_UINT64 },
355 { "hash_elements_max", KSTAT_DATA_UINT64 },
356 { "hash_collisions", KSTAT_DATA_UINT64 },
357 { "hash_chains", KSTAT_DATA_UINT64 },
358 { "hash_chain_max", KSTAT_DATA_UINT64 },
359 { "p", KSTAT_DATA_UINT64 },
360 { "c", KSTAT_DATA_UINT64 },
361 { "c_min", KSTAT_DATA_UINT64 },
362 { "c_max", KSTAT_DATA_UINT64 },
363 { "size", KSTAT_DATA_UINT64 },
364 { "hdr_size", KSTAT_DATA_UINT64 },
365 { "data_size", KSTAT_DATA_UINT64 },
366 { "other_size", KSTAT_DATA_UINT64 },
367 { "anon_size", KSTAT_DATA_UINT64 },
368 { "anon_evict_data", KSTAT_DATA_UINT64 },
369 { "anon_evict_metadata", KSTAT_DATA_UINT64 },
370 { "mru_size", KSTAT_DATA_UINT64 },
371 { "mru_evict_data", KSTAT_DATA_UINT64 },
372 { "mru_evict_metadata", KSTAT_DATA_UINT64 },
373 { "mru_ghost_size", KSTAT_DATA_UINT64 },
374 { "mru_ghost_evict_data", KSTAT_DATA_UINT64 },
375 { "mru_ghost_evict_metadata", KSTAT_DATA_UINT64 },
376 { "mfu_size", KSTAT_DATA_UINT64 },
377 { "mfu_evict_data", KSTAT_DATA_UINT64 },
378 { "mfu_evict_metadata", KSTAT_DATA_UINT64 },
379 { "mfu_ghost_size", KSTAT_DATA_UINT64 },
380 { "mfu_ghost_evict_data", KSTAT_DATA_UINT64 },
381 { "mfu_ghost_evict_metadata", KSTAT_DATA_UINT64 },
382 { "l2_hits", KSTAT_DATA_UINT64 },
383 { "l2_misses", KSTAT_DATA_UINT64 },
384 { "l2_feeds", KSTAT_DATA_UINT64 },
385 { "l2_rw_clash", KSTAT_DATA_UINT64 },
386 { "l2_read_bytes", KSTAT_DATA_UINT64 },
387 { "l2_write_bytes", KSTAT_DATA_UINT64 },
388 { "l2_writes_sent", KSTAT_DATA_UINT64 },
389 { "l2_writes_done", KSTAT_DATA_UINT64 },
390 { "l2_writes_error", KSTAT_DATA_UINT64 },
391 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
392 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
393 { "l2_evict_reading", KSTAT_DATA_UINT64 },
394 { "l2_free_on_write", KSTAT_DATA_UINT64 },
395 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
396 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
397 { "l2_io_error", KSTAT_DATA_UINT64 },
398 { "l2_size", KSTAT_DATA_UINT64 },
399 { "l2_asize", KSTAT_DATA_UINT64 },
400 { "l2_hdr_size", KSTAT_DATA_UINT64 },
401 { "l2_compress_successes", KSTAT_DATA_UINT64 },
402 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
403 { "l2_compress_failures", KSTAT_DATA_UINT64 },
404 { "memory_throttle_count", KSTAT_DATA_UINT64 },
405 { "duplicate_buffers", KSTAT_DATA_UINT64 },
406 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
407 { "duplicate_reads", KSTAT_DATA_UINT64 },
408 { "memory_direct_count", KSTAT_DATA_UINT64 },
409 { "memory_indirect_count", KSTAT_DATA_UINT64 },
410 { "arc_no_grow", KSTAT_DATA_UINT64 },
411 { "arc_tempreserve", KSTAT_DATA_UINT64 },
412 { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
413 { "arc_prune", KSTAT_DATA_UINT64 },
414 { "arc_meta_used", KSTAT_DATA_UINT64 },
415 { "arc_meta_limit", KSTAT_DATA_UINT64 },
416 { "arc_meta_max", KSTAT_DATA_UINT64 },
417 };
418
419 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
420
421 #define ARCSTAT_INCR(stat, val) \
422 atomic_add_64(&arc_stats.stat.value.ui64, (val));
423
424 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
425 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
426
427 #define ARCSTAT_MAX(stat, val) { \
428 uint64_t m; \
429 while ((val) > (m = arc_stats.stat.value.ui64) && \
430 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
431 continue; \
432 }
433
434 #define ARCSTAT_MAXSTAT(stat) \
435 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
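/*
 * Illustrative sketch (not part of the original file): ARCSTAT_MAX above
 * is a lock-free running maximum.  It re-reads the current value and
 * retries the compare-and-swap until either the stored maximum already
 * covers val or the CAS succeeds.  Written out as a function (compiled
 * out here) it is roughly:
 */
#if 0
static void
example_stat_max(uint64_t *statp, uint64_t val)
{
	uint64_t m;

	while (val > (m = *statp) &&
	    m != atomic_cas_64(statp, m, val))
		continue;
}
#endif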
436
437 /*
438 * We define a macro to allow ARC hits/misses to be easily broken down by
439 * two separate conditions, giving a total of four different subtypes for
440 * each of hits and misses (so eight statistics total).
441 */
442 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
443 if (cond1) { \
444 if (cond2) { \
445 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
446 } else { \
447 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
448 } \
449 } else { \
450 if (cond2) { \
451 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
452 } else { \
453 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
454 } \
455 }
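/*
 * Example (taken from arc_buf_add_ref() below): a hit is broken down by
 * demand vs. prefetch and data vs. metadata with a single invocation:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */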
456
457 kstat_t *arc_ksp;
458 static arc_state_t *arc_anon;
459 static arc_state_t *arc_mru;
460 static arc_state_t *arc_mru_ghost;
461 static arc_state_t *arc_mfu;
462 static arc_state_t *arc_mfu_ghost;
463 static arc_state_t *arc_l2c_only;
464
465 /*
466 * There are several ARC variables that are critical to export as kstats --
467 * but we don't want to have to grovel around in the kstat whenever we wish to
468 * manipulate them. For these variables, we therefore define them to be in
469 * terms of the statistic variable. This assures that we are not introducing
470 * the possibility of inconsistency by having shadow copies of the variables,
471 * while still allowing the code to be readable.
472 */
473 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
474 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
475 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
476 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
477 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
478 #define arc_no_grow ARCSTAT(arcstat_no_grow)
479 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
480 #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
481 #define arc_meta_used ARCSTAT(arcstat_meta_used)
482 #define arc_meta_limit ARCSTAT(arcstat_meta_limit)
483 #define arc_meta_max ARCSTAT(arcstat_meta_max)
484
485 #define L2ARC_IS_VALID_COMPRESS(_c_) \
486 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
487
488 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
489
490 typedef struct arc_callback arc_callback_t;
491
492 struct arc_callback {
493 void *acb_private;
494 arc_done_func_t *acb_done;
495 arc_buf_t *acb_buf;
496 zio_t *acb_zio_dummy;
497 arc_callback_t *acb_next;
498 };
499
500 typedef struct arc_write_callback arc_write_callback_t;
501
502 struct arc_write_callback {
503 void *awcb_private;
504 arc_done_func_t *awcb_ready;
505 arc_done_func_t *awcb_done;
506 arc_buf_t *awcb_buf;
507 };
508
509 struct arc_buf_hdr {
510 /* protected by hash lock */
511 dva_t b_dva;
512 uint64_t b_birth;
513 uint64_t b_cksum0;
514
515 kmutex_t b_freeze_lock;
516 zio_cksum_t *b_freeze_cksum;
517
518 arc_buf_hdr_t *b_hash_next;
519 arc_buf_t *b_buf;
520 uint32_t b_flags;
521 uint32_t b_datacnt;
522
523 arc_callback_t *b_acb;
524 kcondvar_t b_cv;
525
526 /* immutable */
527 arc_buf_contents_t b_type;
528 uint64_t b_size;
529 uint64_t b_spa;
530
531 /* protected by arc state mutex */
532 arc_state_t *b_state;
533 list_node_t b_arc_node;
534
535 /* updated atomically */
536 clock_t b_arc_access;
537
538 /* self protecting */
539 refcount_t b_refcnt;
540
541 l2arc_buf_hdr_t *b_l2hdr;
542 list_node_t b_l2node;
543 };
544
545 static list_t arc_prune_list;
546 static kmutex_t arc_prune_mtx;
547 static arc_buf_t *arc_eviction_list;
548 static kmutex_t arc_eviction_mtx;
549 static arc_buf_hdr_t arc_eviction_hdr;
550 static void arc_get_data_buf(arc_buf_t *buf);
551 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
552 static int arc_evict_needed(arc_buf_contents_t type);
553 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
554
555 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
556
557 #define GHOST_STATE(state) \
558 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
559 (state) == arc_l2c_only)
560
561 /*
562 * Private ARC flags. These flags are private ARC only flags that will show up
563 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
564 * be passed in as arc_flags in things like arc_read. However, these flags
565 * should never be passed and should only be set by ARC code. When adding new
566 * public flags, make sure not to smash the private ones.
567 */
568
569 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
570 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
571 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
572 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
573 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
574 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
575 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
576 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
577 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
578 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
579
580 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
581 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
582 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
583 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
584 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
585 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
586 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
587 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
588 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
589 (hdr)->b_l2hdr != NULL)
590 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
591 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
592 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
593
594 /*
595 * Other sizes
596 */
597
598 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
599 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
600
601 /*
602 * Hash table routines
603 */
604
605 #define HT_LOCK_ALIGN 64
606 #define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
607
608 struct ht_lock {
609 kmutex_t ht_lock;
610 #ifdef _KERNEL
611 unsigned char pad[HT_LOCK_PAD];
612 #endif
613 };
614
615 #define BUF_LOCKS 256
616 typedef struct buf_hash_table {
617 uint64_t ht_mask;
618 arc_buf_hdr_t **ht_table;
619 struct ht_lock ht_locks[BUF_LOCKS];
620 } buf_hash_table_t;
621
622 static buf_hash_table_t buf_hash_table;
623
624 #define BUF_HASH_INDEX(spa, dva, birth) \
625 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
626 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
627 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
628 #define HDR_LOCK(hdr) \
629 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
630
631 uint64_t zfs_crc64_table[256];
632
633 /*
634 * Level 2 ARC
635 */
636
637 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
638 #define L2ARC_HEADROOM 2 /* num of writes */
639 /*
640 * If we discover during ARC scan any buffers to be compressed, we boost
641 * our headroom for the next scanning cycle by this percentage multiple.
642 */
643 #define L2ARC_HEADROOM_BOOST 200
644 #define L2ARC_FEED_SECS 1 /* caching interval secs */
645 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
646
647 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
648 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
649
650 /*
651 * L2ARC Performance Tunables
652 */
653 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
654 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
655 unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
656 unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
657 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
658 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
659 int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
660 int l2arc_nocompress = B_FALSE; /* don't compress bufs */
661 int l2arc_feed_again = B_TRUE; /* turbo warmup */
662 int l2arc_norw = B_FALSE; /* no reads during writes */
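/*
 * Worked example (illustrative, not part of the original file): with the
 * defaults above the feed thread targets up to 8 MB (l2arc_write_max) of
 * L2ARC writes per l2arc_feed_secs interval, with an extra 8 MB
 * (l2arc_write_boost) allowed per pass while the device is still warming
 * up.  Scanning headroom is l2arc_headroom (2) device writes' worth of
 * buffers, boosted by up to L2ARC_HEADROOM_BOOST (200%) for the next
 * cycle when compressible buffers are found.
 */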
663
664 /*
665 * L2ARC Internals
666 */
667 typedef struct l2arc_dev {
668 vdev_t *l2ad_vdev; /* vdev */
669 spa_t *l2ad_spa; /* spa */
670 uint64_t l2ad_hand; /* next write location */
671 uint64_t l2ad_start; /* first addr on device */
672 uint64_t l2ad_end; /* last addr on device */
673 uint64_t l2ad_evict; /* last addr eviction reached */
674 boolean_t l2ad_first; /* first sweep through */
675 boolean_t l2ad_writing; /* currently writing */
676 list_t *l2ad_buflist; /* buffer list */
677 list_node_t l2ad_node; /* device list node */
678 } l2arc_dev_t;
679
680 static list_t L2ARC_dev_list; /* device list */
681 static list_t *l2arc_dev_list; /* device list pointer */
682 static kmutex_t l2arc_dev_mtx; /* device list mutex */
683 static l2arc_dev_t *l2arc_dev_last; /* last device used */
684 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
685 static list_t L2ARC_free_on_write; /* free after write buf list */
686 static list_t *l2arc_free_on_write; /* free after write list ptr */
687 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
688 static uint64_t l2arc_ndev; /* number of devices */
689
690 typedef struct l2arc_read_callback {
691 arc_buf_t *l2rcb_buf; /* read buffer */
692 spa_t *l2rcb_spa; /* spa */
693 blkptr_t l2rcb_bp; /* original blkptr */
694 zbookmark_t l2rcb_zb; /* original bookmark */
695 int l2rcb_flags; /* original flags */
696 enum zio_compress l2rcb_compress; /* applied compress */
697 } l2arc_read_callback_t;
698
699 typedef struct l2arc_write_callback {
700 l2arc_dev_t *l2wcb_dev; /* device info */
701 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
702 } l2arc_write_callback_t;
703
704 struct l2arc_buf_hdr {
705 /* protected by arc_buf_hdr mutex */
706 l2arc_dev_t *b_dev; /* L2ARC device */
707 uint64_t b_daddr; /* disk address, offset byte */
708 /* compression applied to buffer data */
709 enum zio_compress b_compress;
710 /* real alloc'd buffer size depending on b_compress applied */
711 int b_asize;
712 /* temporary buffer holder for in-flight compressed data */
713 void *b_tmp_cdata;
714 };
715
716 typedef struct l2arc_data_free {
717 /* protected by l2arc_free_on_write_mtx */
718 void *l2df_data;
719 size_t l2df_size;
720 void (*l2df_func)(void *, size_t);
721 list_node_t l2df_list_node;
722 } l2arc_data_free_t;
723
724 static kmutex_t l2arc_feed_thr_lock;
725 static kcondvar_t l2arc_feed_thr_cv;
726 static uint8_t l2arc_thread_exit;
727
728 static void l2arc_read_done(zio_t *zio);
729 static void l2arc_hdr_stat_add(void);
730 static void l2arc_hdr_stat_remove(void);
731
732 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
733 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
734 enum zio_compress c);
735 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
736
737 static uint64_t
738 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
739 {
740 uint8_t *vdva = (uint8_t *)dva;
741 uint64_t crc = -1ULL;
742 int i;
743
744 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
745
746 for (i = 0; i < sizeof (dva_t); i++)
747 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
748
749 crc ^= (spa>>8) ^ birth;
750
751 return (crc);
752 }
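/*
 * Illustrative sketch (not part of the original file): how the hash above
 * is consumed.  The 64-bit value is masked down to a bucket index, and
 * the low bits of that same index select one of the BUF_LOCKS striped
 * mutexes, so headers that land in the same bucket always share a lock.
 * The helper below is hypothetical and compiled out; buf_hash_find()
 * performs the same steps inline.
 */
#if 0
static kmutex_t *
example_hash_lock_for(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);

	return (BUF_HASH_LOCK(idx));
}
#endif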
753
754 #define BUF_EMPTY(buf) \
755 ((buf)->b_dva.dva_word[0] == 0 && \
756 (buf)->b_dva.dva_word[1] == 0 && \
757 (buf)->b_birth == 0)
758
759 #define BUF_EQUAL(spa, dva, birth, buf) \
760 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
761 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
762 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
763
764 static void
765 buf_discard_identity(arc_buf_hdr_t *hdr)
766 {
767 hdr->b_dva.dva_word[0] = 0;
768 hdr->b_dva.dva_word[1] = 0;
769 hdr->b_birth = 0;
770 hdr->b_cksum0 = 0;
771 }
772
773 static arc_buf_hdr_t *
774 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
775 {
776 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
777 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
778 arc_buf_hdr_t *buf;
779
780 mutex_enter(hash_lock);
781 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
782 buf = buf->b_hash_next) {
783 if (BUF_EQUAL(spa, dva, birth, buf)) {
784 *lockp = hash_lock;
785 return (buf);
786 }
787 }
788 mutex_exit(hash_lock);
789 *lockp = NULL;
790 return (NULL);
791 }
792
793 /*
794 * Insert an entry into the hash table. If there is already an element
795 * equal to elem in the hash table, then the already existing element
796 * will be returned and the new element will not be inserted.
797 * Otherwise returns NULL.
798 */
799 static arc_buf_hdr_t *
800 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
801 {
802 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
803 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
804 arc_buf_hdr_t *fbuf;
805 uint32_t i;
806
807 ASSERT(!HDR_IN_HASH_TABLE(buf));
808 *lockp = hash_lock;
809 mutex_enter(hash_lock);
810 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
811 fbuf = fbuf->b_hash_next, i++) {
812 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
813 return (fbuf);
814 }
815
816 buf->b_hash_next = buf_hash_table.ht_table[idx];
817 buf_hash_table.ht_table[idx] = buf;
818 buf->b_flags |= ARC_IN_HASH_TABLE;
819
820 /* collect some hash table performance data */
821 if (i > 0) {
822 ARCSTAT_BUMP(arcstat_hash_collisions);
823 if (i == 1)
824 ARCSTAT_BUMP(arcstat_hash_chains);
825
826 ARCSTAT_MAX(arcstat_hash_chain_max, i);
827 }
828
829 ARCSTAT_BUMP(arcstat_hash_elements);
830 ARCSTAT_MAXSTAT(arcstat_hash_elements);
831
832 return (NULL);
833 }
834
835 static void
836 buf_hash_remove(arc_buf_hdr_t *buf)
837 {
838 arc_buf_hdr_t *fbuf, **bufp;
839 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
840
841 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
842 ASSERT(HDR_IN_HASH_TABLE(buf));
843
844 bufp = &buf_hash_table.ht_table[idx];
845 while ((fbuf = *bufp) != buf) {
846 ASSERT(fbuf != NULL);
847 bufp = &fbuf->b_hash_next;
848 }
849 *bufp = buf->b_hash_next;
850 buf->b_hash_next = NULL;
851 buf->b_flags &= ~ARC_IN_HASH_TABLE;
852
853 /* collect some hash table performance data */
854 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
855
856 if (buf_hash_table.ht_table[idx] &&
857 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
858 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
859 }
860
861 /*
862 * Global data structures and functions for the buf kmem cache.
863 */
864 static kmem_cache_t *hdr_cache;
865 static kmem_cache_t *buf_cache;
866
867 static void
868 buf_fini(void)
869 {
870 int i;
871
872 #if defined(_KERNEL) && defined(HAVE_SPL)
873 /* Large allocations which do not require contiguous pages
874 * should be using vmem_free() in the linux kernel */
875 vmem_free(buf_hash_table.ht_table,
876 (buf_hash_table.ht_mask + 1) * sizeof (void *));
877 #else
878 kmem_free(buf_hash_table.ht_table,
879 (buf_hash_table.ht_mask + 1) * sizeof (void *));
880 #endif
881 for (i = 0; i < BUF_LOCKS; i++)
882 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
883 kmem_cache_destroy(hdr_cache);
884 kmem_cache_destroy(buf_cache);
885 }
886
887 /*
888 * Constructor callback - called when the cache is empty
889 * and a new buf is requested.
890 */
891 /* ARGSUSED */
892 static int
893 hdr_cons(void *vbuf, void *unused, int kmflag)
894 {
895 arc_buf_hdr_t *buf = vbuf;
896
897 bzero(buf, sizeof (arc_buf_hdr_t));
898 refcount_create(&buf->b_refcnt);
899 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
900 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
901 list_link_init(&buf->b_arc_node);
902 list_link_init(&buf->b_l2node);
903 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
904
905 return (0);
906 }
907
908 /* ARGSUSED */
909 static int
910 buf_cons(void *vbuf, void *unused, int kmflag)
911 {
912 arc_buf_t *buf = vbuf;
913
914 bzero(buf, sizeof (arc_buf_t));
915 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
916 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
917
918 return (0);
919 }
920
921 /*
922 * Destructor callback - called when a cached buf is
923 * no longer required.
924 */
925 /* ARGSUSED */
926 static void
927 hdr_dest(void *vbuf, void *unused)
928 {
929 arc_buf_hdr_t *buf = vbuf;
930
931 ASSERT(BUF_EMPTY(buf));
932 refcount_destroy(&buf->b_refcnt);
933 cv_destroy(&buf->b_cv);
934 mutex_destroy(&buf->b_freeze_lock);
935 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
936 }
937
938 /* ARGSUSED */
939 static void
940 buf_dest(void *vbuf, void *unused)
941 {
942 arc_buf_t *buf = vbuf;
943
944 mutex_destroy(&buf->b_evict_lock);
945 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
946 }
947
948 static void
949 buf_init(void)
950 {
951 uint64_t *ct;
952 uint64_t hsize = 1ULL << 12;
953 int i, j;
954
955 /*
956 * The hash table is big enough to fill all of physical memory
957 * with an average 64K block size. The table will take up
958 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
959 */
960 while (hsize * 65536 < physmem * PAGESIZE)
961 hsize <<= 1;
962 retry:
963 buf_hash_table.ht_mask = hsize - 1;
964 #if defined(_KERNEL) && defined(HAVE_SPL)
965 /* Large allocations which do not require contiguous pages
966 * should be using vmem_alloc() in the linux kernel */
967 buf_hash_table.ht_table =
968 vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
969 #else
970 buf_hash_table.ht_table =
971 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
972 #endif
973 if (buf_hash_table.ht_table == NULL) {
974 ASSERT(hsize > (1ULL << 8));
975 hsize >>= 1;
976 goto retry;
977 }
978
979 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
980 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
981 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
982 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
983
984 for (i = 0; i < 256; i++)
985 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
986 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
987
988 for (i = 0; i < BUF_LOCKS; i++) {
989 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
990 NULL, MUTEX_DEFAULT, NULL);
991 }
992 }
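/*
 * Worked example (illustrative, not part of the original file): on a
 * machine with 8 GiB of physical memory the sizing loop in buf_init()
 * doubles hsize, starting from 2^12, until hsize * 64K >= 8 GiB, giving
 * 131072 buckets (ht_mask = 131071).  With 8-byte pointers the table is
 * then 1 MiB, matching the "128KB/GB" estimate in the comment above.
 */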
993
994 #define ARC_MINTIME (hz>>4) /* 62 ms */
995
996 static void
997 arc_cksum_verify(arc_buf_t *buf)
998 {
999 zio_cksum_t zc;
1000
1001 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1002 return;
1003
1004 mutex_enter(&buf->b_hdr->b_freeze_lock);
1005 if (buf->b_hdr->b_freeze_cksum == NULL ||
1006 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1007 mutex_exit(&buf->b_hdr->b_freeze_lock);
1008 return;
1009 }
1010 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1011 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1012 panic("buffer modified while frozen!");
1013 mutex_exit(&buf->b_hdr->b_freeze_lock);
1014 }
1015
1016 static int
1017 arc_cksum_equal(arc_buf_t *buf)
1018 {
1019 zio_cksum_t zc;
1020 int equal;
1021
1022 mutex_enter(&buf->b_hdr->b_freeze_lock);
1023 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1024 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1025 mutex_exit(&buf->b_hdr->b_freeze_lock);
1026
1027 return (equal);
1028 }
1029
1030 static void
1031 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1032 {
1033 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1034 return;
1035
1036 mutex_enter(&buf->b_hdr->b_freeze_lock);
1037 if (buf->b_hdr->b_freeze_cksum != NULL) {
1038 mutex_exit(&buf->b_hdr->b_freeze_lock);
1039 return;
1040 }
1041 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1042 KM_PUSHPAGE);
1043 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1044 buf->b_hdr->b_freeze_cksum);
1045 mutex_exit(&buf->b_hdr->b_freeze_lock);
1046 }
1047
1048 void
1049 arc_buf_thaw(arc_buf_t *buf)
1050 {
1051 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1052 if (buf->b_hdr->b_state != arc_anon)
1053 panic("modifying non-anon buffer!");
1054 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1055 panic("modifying buffer while i/o in progress!");
1056 arc_cksum_verify(buf);
1057 }
1058
1059 mutex_enter(&buf->b_hdr->b_freeze_lock);
1060 if (buf->b_hdr->b_freeze_cksum != NULL) {
1061 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1062 buf->b_hdr->b_freeze_cksum = NULL;
1063 }
1064
1065 mutex_exit(&buf->b_hdr->b_freeze_lock);
1066 }
1067
1068 void
1069 arc_buf_freeze(arc_buf_t *buf)
1070 {
1071 kmutex_t *hash_lock;
1072
1073 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1074 return;
1075
1076 hash_lock = HDR_LOCK(buf->b_hdr);
1077 mutex_enter(hash_lock);
1078
1079 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1080 buf->b_hdr->b_state == arc_anon);
1081 arc_cksum_compute(buf, B_FALSE);
1082 mutex_exit(hash_lock);
1083 }
1084
1085 static void
1086 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1087 {
1088 ASSERT(MUTEX_HELD(hash_lock));
1089
1090 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1091 (ab->b_state != arc_anon)) {
1092 uint64_t delta = ab->b_size * ab->b_datacnt;
1093 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1094 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1095
1096 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1097 mutex_enter(&ab->b_state->arcs_mtx);
1098 ASSERT(list_link_active(&ab->b_arc_node));
1099 list_remove(list, ab);
1100 if (GHOST_STATE(ab->b_state)) {
1101 ASSERT0(ab->b_datacnt);
1102 ASSERT3P(ab->b_buf, ==, NULL);
1103 delta = ab->b_size;
1104 }
1105 ASSERT(delta > 0);
1106 ASSERT3U(*size, >=, delta);
1107 atomic_add_64(size, -delta);
1108 mutex_exit(&ab->b_state->arcs_mtx);
1109 /* remove the prefetch flag if we get a reference */
1110 if (ab->b_flags & ARC_PREFETCH)
1111 ab->b_flags &= ~ARC_PREFETCH;
1112 }
1113 }
1114
1115 static int
1116 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1117 {
1118 int cnt;
1119 arc_state_t *state = ab->b_state;
1120
1121 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1122 ASSERT(!GHOST_STATE(state));
1123
1124 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1125 (state != arc_anon)) {
1126 uint64_t *size = &state->arcs_lsize[ab->b_type];
1127
1128 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1129 mutex_enter(&state->arcs_mtx);
1130 ASSERT(!list_link_active(&ab->b_arc_node));
1131 list_insert_head(&state->arcs_list[ab->b_type], ab);
1132 ASSERT(ab->b_datacnt > 0);
1133 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1134 mutex_exit(&state->arcs_mtx);
1135 }
1136 return (cnt);
1137 }
1138
1139 /*
1140 * Move the supplied buffer to the indicated state. The mutex
1141 * for the buffer must be held by the caller.
1142 */
1143 static void
1144 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1145 {
1146 arc_state_t *old_state = ab->b_state;
1147 int64_t refcnt = refcount_count(&ab->b_refcnt);
1148 uint64_t from_delta, to_delta;
1149
1150 ASSERT(MUTEX_HELD(hash_lock));
1151 ASSERT(new_state != old_state);
1152 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1153 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1154 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1155
1156 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1157
1158 /*
1159 * If this buffer is evictable, transfer it from the
1160 * old state list to the new state list.
1161 */
1162 if (refcnt == 0) {
1163 if (old_state != arc_anon) {
1164 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1165 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1166
1167 if (use_mutex)
1168 mutex_enter(&old_state->arcs_mtx);
1169
1170 ASSERT(list_link_active(&ab->b_arc_node));
1171 list_remove(&old_state->arcs_list[ab->b_type], ab);
1172
1173 /*
1174 * If prefetching out of the ghost cache,
1175 * we will have a non-zero datacnt.
1176 */
1177 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1178 /* ghost elements have a ghost size */
1179 ASSERT(ab->b_buf == NULL);
1180 from_delta = ab->b_size;
1181 }
1182 ASSERT3U(*size, >=, from_delta);
1183 atomic_add_64(size, -from_delta);
1184
1185 if (use_mutex)
1186 mutex_exit(&old_state->arcs_mtx);
1187 }
1188 if (new_state != arc_anon) {
1189 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1190 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1191
1192 if (use_mutex)
1193 mutex_enter(&new_state->arcs_mtx);
1194
1195 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1196
1197 /* ghost elements have a ghost size */
1198 if (GHOST_STATE(new_state)) {
1199 ASSERT(ab->b_datacnt == 0);
1200 ASSERT(ab->b_buf == NULL);
1201 to_delta = ab->b_size;
1202 }
1203 atomic_add_64(size, to_delta);
1204
1205 if (use_mutex)
1206 mutex_exit(&new_state->arcs_mtx);
1207 }
1208 }
1209
1210 ASSERT(!BUF_EMPTY(ab));
1211 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1212 buf_hash_remove(ab);
1213
1214 /* adjust state sizes */
1215 if (to_delta)
1216 atomic_add_64(&new_state->arcs_size, to_delta);
1217 if (from_delta) {
1218 ASSERT3U(old_state->arcs_size, >=, from_delta);
1219 atomic_add_64(&old_state->arcs_size, -from_delta);
1220 }
1221 ab->b_state = new_state;
1222
1223 /* adjust l2arc hdr stats */
1224 if (new_state == arc_l2c_only)
1225 l2arc_hdr_stat_add();
1226 else if (old_state == arc_l2c_only)
1227 l2arc_hdr_stat_remove();
1228 }
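/*
 * Worked example (illustrative, not part of the original file): moving a
 * 128K header with b_datacnt == 2 and no external references from arc_mru
 * to arc_mfu transfers from_delta == to_delta == 256K of evictable lsize
 * and arcs_size between the two states.  Moving the same header into
 * arc_mru_ghost after its data has been dropped (b_datacnt == 0) charges
 * only the 128K "ghost size" of the block to the ghost state, since the
 * data itself was already accounted out when the buffers were destroyed.
 */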
1229
1230 void
1231 arc_space_consume(uint64_t space, arc_space_type_t type)
1232 {
1233 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1234
1235 switch (type) {
1236 default:
1237 break;
1238 case ARC_SPACE_DATA:
1239 ARCSTAT_INCR(arcstat_data_size, space);
1240 break;
1241 case ARC_SPACE_OTHER:
1242 ARCSTAT_INCR(arcstat_other_size, space);
1243 break;
1244 case ARC_SPACE_HDRS:
1245 ARCSTAT_INCR(arcstat_hdr_size, space);
1246 break;
1247 case ARC_SPACE_L2HDRS:
1248 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1249 break;
1250 }
1251
1252 atomic_add_64(&arc_meta_used, space);
1253 atomic_add_64(&arc_size, space);
1254 }
1255
1256 void
1257 arc_space_return(uint64_t space, arc_space_type_t type)
1258 {
1259 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1260
1261 switch (type) {
1262 default:
1263 break;
1264 case ARC_SPACE_DATA:
1265 ARCSTAT_INCR(arcstat_data_size, -space);
1266 break;
1267 case ARC_SPACE_OTHER:
1268 ARCSTAT_INCR(arcstat_other_size, -space);
1269 break;
1270 case ARC_SPACE_HDRS:
1271 ARCSTAT_INCR(arcstat_hdr_size, -space);
1272 break;
1273 case ARC_SPACE_L2HDRS:
1274 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1275 break;
1276 }
1277
1278 ASSERT(arc_meta_used >= space);
1279 if (arc_meta_max < arc_meta_used)
1280 arc_meta_max = arc_meta_used;
1281 atomic_add_64(&arc_meta_used, -space);
1282 ASSERT(arc_size >= space);
1283 atomic_add_64(&arc_size, -space);
1284 }
1285
1286 arc_buf_t *
1287 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1288 {
1289 arc_buf_hdr_t *hdr;
1290 arc_buf_t *buf;
1291
1292 ASSERT3U(size, >, 0);
1293 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1294 ASSERT(BUF_EMPTY(hdr));
1295 hdr->b_size = size;
1296 hdr->b_type = type;
1297 hdr->b_spa = spa_load_guid(spa);
1298 hdr->b_state = arc_anon;
1299 hdr->b_arc_access = 0;
1300 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1301 buf->b_hdr = hdr;
1302 buf->b_data = NULL;
1303 buf->b_efunc = NULL;
1304 buf->b_private = NULL;
1305 buf->b_next = NULL;
1306 hdr->b_buf = buf;
1307 arc_get_data_buf(buf);
1308 hdr->b_datacnt = 1;
1309 hdr->b_flags = 0;
1310 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1311 (void) refcount_add(&hdr->b_refcnt, tag);
1312
1313 return (buf);
1314 }
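/*
 * Illustrative sketch (not part of the original file): the minimal life
 * cycle of an anonymous buffer from a caller's point of view.  The tag
 * passed to arc_buf_alloc() must be the same one later handed back when
 * the buffer is released.  The helper is hypothetical and compiled out.
 */
#if 0
static void
example_alloc_and_free(spa_t *spa)
{
	arc_buf_t *buf;

	buf = arc_buf_alloc(spa, SPA_MINBLOCKSIZE, FTAG, ARC_BUFC_DATA);
	/* ... fill buf->b_data with SPA_MINBLOCKSIZE bytes ... */
	arc_buf_free(buf, FTAG);
}
#endif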
1315
1316 static char *arc_onloan_tag = "onloan";
1317
1318 /*
1319 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1320 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1321 * buffers must be returned to the arc before they can be used by the DMU or
1322 * freed.
1323 */
1324 arc_buf_t *
1325 arc_loan_buf(spa_t *spa, int size)
1326 {
1327 arc_buf_t *buf;
1328
1329 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1330
1331 atomic_add_64(&arc_loaned_bytes, size);
1332 return (buf);
1333 }
1334
1335 /*
1336 * Return a loaned arc buffer to the arc.
1337 */
1338 void
1339 arc_return_buf(arc_buf_t *buf, void *tag)
1340 {
1341 arc_buf_hdr_t *hdr = buf->b_hdr;
1342
1343 ASSERT(buf->b_data != NULL);
1344 (void) refcount_add(&hdr->b_refcnt, tag);
1345 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1346
1347 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1348 }
1349
1350 /* Detach an arc_buf from a dbuf (tag) */
1351 void
1352 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1353 {
1354 arc_buf_hdr_t *hdr;
1355
1356 ASSERT(buf->b_data != NULL);
1357 hdr = buf->b_hdr;
1358 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1359 (void) refcount_remove(&hdr->b_refcnt, tag);
1360 buf->b_efunc = NULL;
1361 buf->b_private = NULL;
1362
1363 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1364 }
1365
1366 static arc_buf_t *
1367 arc_buf_clone(arc_buf_t *from)
1368 {
1369 arc_buf_t *buf;
1370 arc_buf_hdr_t *hdr = from->b_hdr;
1371 uint64_t size = hdr->b_size;
1372
1373 ASSERT(hdr->b_state != arc_anon);
1374
1375 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1376 buf->b_hdr = hdr;
1377 buf->b_data = NULL;
1378 buf->b_efunc = NULL;
1379 buf->b_private = NULL;
1380 buf->b_next = hdr->b_buf;
1381 hdr->b_buf = buf;
1382 arc_get_data_buf(buf);
1383 bcopy(from->b_data, buf->b_data, size);
1384
1385 /*
1386 * This buffer already exists in the arc so create a duplicate
1387 * copy for the caller. If the buffer is associated with user data
1388 * then track the size and number of duplicates. These stats will be
1389 * updated as duplicate buffers are created and destroyed.
1390 */
1391 if (hdr->b_type == ARC_BUFC_DATA) {
1392 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1393 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1394 }
1395 hdr->b_datacnt += 1;
1396 return (buf);
1397 }
1398
1399 void
1400 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1401 {
1402 arc_buf_hdr_t *hdr;
1403 kmutex_t *hash_lock;
1404
1405 /*
1406 * Check to see if this buffer is evicted. Callers
1407 * must verify b_data != NULL to know if the add_ref
1408 * was successful.
1409 */
1410 mutex_enter(&buf->b_evict_lock);
1411 if (buf->b_data == NULL) {
1412 mutex_exit(&buf->b_evict_lock);
1413 return;
1414 }
1415 hash_lock = HDR_LOCK(buf->b_hdr);
1416 mutex_enter(hash_lock);
1417 hdr = buf->b_hdr;
1418 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1419 mutex_exit(&buf->b_evict_lock);
1420
1421 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1422 add_reference(hdr, hash_lock, tag);
1423 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1424 arc_access(hdr, hash_lock);
1425 mutex_exit(hash_lock);
1426 ARCSTAT_BUMP(arcstat_hits);
1427 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1428 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1429 data, metadata, hits);
1430 }
1431
1432 /*
1433 * Free the arc data buffer. If it is an l2arc write in progress,
1434 * the buffer is placed on l2arc_free_on_write to be freed later.
1435 */
1436 static void
1437 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1438 void *data, size_t size)
1439 {
1440 if (HDR_L2_WRITING(hdr)) {
1441 l2arc_data_free_t *df;
1442 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
1443 df->l2df_data = data;
1444 df->l2df_size = size;
1445 df->l2df_func = free_func;
1446 mutex_enter(&l2arc_free_on_write_mtx);
1447 list_insert_head(l2arc_free_on_write, df);
1448 mutex_exit(&l2arc_free_on_write_mtx);
1449 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1450 } else {
1451 free_func(data, size);
1452 }
1453 }
1454
1455 static void
1456 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1457 {
1458 arc_buf_t **bufp;
1459
1460 /* free up data associated with the buf */
1461 if (buf->b_data) {
1462 arc_state_t *state = buf->b_hdr->b_state;
1463 uint64_t size = buf->b_hdr->b_size;
1464 arc_buf_contents_t type = buf->b_hdr->b_type;
1465
1466 arc_cksum_verify(buf);
1467
1468 if (!recycle) {
1469 if (type == ARC_BUFC_METADATA) {
1470 arc_buf_data_free(buf->b_hdr, zio_buf_free,
1471 buf->b_data, size);
1472 arc_space_return(size, ARC_SPACE_DATA);
1473 } else {
1474 ASSERT(type == ARC_BUFC_DATA);
1475 arc_buf_data_free(buf->b_hdr,
1476 zio_data_buf_free, buf->b_data, size);
1477 ARCSTAT_INCR(arcstat_data_size, -size);
1478 atomic_add_64(&arc_size, -size);
1479 }
1480 }
1481 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1482 uint64_t *cnt = &state->arcs_lsize[type];
1483
1484 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1485 ASSERT(state != arc_anon);
1486
1487 ASSERT3U(*cnt, >=, size);
1488 atomic_add_64(cnt, -size);
1489 }
1490 ASSERT3U(state->arcs_size, >=, size);
1491 atomic_add_64(&state->arcs_size, -size);
1492 buf->b_data = NULL;
1493
1494 /*
1495 * If we're destroying a duplicate buffer make sure
1496 * that the appropriate statistics are updated.
1497 */
1498 if (buf->b_hdr->b_datacnt > 1 &&
1499 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1500 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1501 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1502 }
1503 ASSERT(buf->b_hdr->b_datacnt > 0);
1504 buf->b_hdr->b_datacnt -= 1;
1505 }
1506
1507 /* only remove the buf if requested */
1508 if (!all)
1509 return;
1510
1511 /* remove the buf from the hdr list */
1512 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1513 continue;
1514 *bufp = buf->b_next;
1515 buf->b_next = NULL;
1516
1517 ASSERT(buf->b_efunc == NULL);
1518
1519 /* clean up the buf */
1520 buf->b_hdr = NULL;
1521 kmem_cache_free(buf_cache, buf);
1522 }
1523
1524 static void
1525 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1526 {
1527 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1528
1529 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1530 ASSERT3P(hdr->b_state, ==, arc_anon);
1531 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1532
1533 if (l2hdr != NULL) {
1534 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1535 /*
1536 * To prevent arc_free() and l2arc_evict() from
1537 * attempting to free the same buffer at the same time,
1538 * a FREE_IN_PROGRESS flag is given to arc_free() to
1539 * give it priority. l2arc_evict() can't destroy this
1540 * header while we are waiting on l2arc_buflist_mtx.
1541 *
1542 * The hdr may be removed from l2ad_buflist before we
1543 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1544 */
1545 if (!buflist_held) {
1546 mutex_enter(&l2arc_buflist_mtx);
1547 l2hdr = hdr->b_l2hdr;
1548 }
1549
1550 if (l2hdr != NULL) {
1551 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1552 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1553 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1554 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1555 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
1556 if (hdr->b_state == arc_l2c_only)
1557 l2arc_hdr_stat_remove();
1558 hdr->b_l2hdr = NULL;
1559 }
1560
1561 if (!buflist_held)
1562 mutex_exit(&l2arc_buflist_mtx);
1563 }
1564
1565 if (!BUF_EMPTY(hdr)) {
1566 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1567 buf_discard_identity(hdr);
1568 }
1569 while (hdr->b_buf) {
1570 arc_buf_t *buf = hdr->b_buf;
1571
1572 if (buf->b_efunc) {
1573 mutex_enter(&arc_eviction_mtx);
1574 mutex_enter(&buf->b_evict_lock);
1575 ASSERT(buf->b_hdr != NULL);
1576 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1577 hdr->b_buf = buf->b_next;
1578 buf->b_hdr = &arc_eviction_hdr;
1579 buf->b_next = arc_eviction_list;
1580 arc_eviction_list = buf;
1581 mutex_exit(&buf->b_evict_lock);
1582 mutex_exit(&arc_eviction_mtx);
1583 } else {
1584 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1585 }
1586 }
1587 if (hdr->b_freeze_cksum != NULL) {
1588 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1589 hdr->b_freeze_cksum = NULL;
1590 }
1591
1592 ASSERT(!list_link_active(&hdr->b_arc_node));
1593 ASSERT3P(hdr->b_hash_next, ==, NULL);
1594 ASSERT3P(hdr->b_acb, ==, NULL);
1595 kmem_cache_free(hdr_cache, hdr);
1596 }
1597
1598 void
1599 arc_buf_free(arc_buf_t *buf, void *tag)
1600 {
1601 arc_buf_hdr_t *hdr = buf->b_hdr;
1602 int hashed = hdr->b_state != arc_anon;
1603
1604 ASSERT(buf->b_efunc == NULL);
1605 ASSERT(buf->b_data != NULL);
1606
1607 if (hashed) {
1608 kmutex_t *hash_lock = HDR_LOCK(hdr);
1609
1610 mutex_enter(hash_lock);
1611 hdr = buf->b_hdr;
1612 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1613
1614 (void) remove_reference(hdr, hash_lock, tag);
1615 if (hdr->b_datacnt > 1) {
1616 arc_buf_destroy(buf, FALSE, TRUE);
1617 } else {
1618 ASSERT(buf == hdr->b_buf);
1619 ASSERT(buf->b_efunc == NULL);
1620 hdr->b_flags |= ARC_BUF_AVAILABLE;
1621 }
1622 mutex_exit(hash_lock);
1623 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1624 int destroy_hdr;
1625 /*
1626 * We are in the middle of an async write. Don't destroy
1627 * this buffer unless the write completes before we finish
1628 * decrementing the reference count.
1629 */
1630 mutex_enter(&arc_eviction_mtx);
1631 (void) remove_reference(hdr, NULL, tag);
1632 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1633 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1634 mutex_exit(&arc_eviction_mtx);
1635 if (destroy_hdr)
1636 arc_hdr_destroy(hdr);
1637 } else {
1638 if (remove_reference(hdr, NULL, tag) > 0)
1639 arc_buf_destroy(buf, FALSE, TRUE);
1640 else
1641 arc_hdr_destroy(hdr);
1642 }
1643 }
1644
1645 int
1646 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1647 {
1648 arc_buf_hdr_t *hdr = buf->b_hdr;
1649 kmutex_t *hash_lock = NULL;
1650 int no_callback = (buf->b_efunc == NULL);
1651
1652 if (hdr->b_state == arc_anon) {
1653 ASSERT(hdr->b_datacnt == 1);
1654 arc_buf_free(buf, tag);
1655 return (no_callback);
1656 }
1657
1658 hash_lock = HDR_LOCK(hdr);
1659 mutex_enter(hash_lock);
1660 hdr = buf->b_hdr;
1661 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1662 ASSERT(hdr->b_state != arc_anon);
1663 ASSERT(buf->b_data != NULL);
1664
1665 (void) remove_reference(hdr, hash_lock, tag);
1666 if (hdr->b_datacnt > 1) {
1667 if (no_callback)
1668 arc_buf_destroy(buf, FALSE, TRUE);
1669 } else if (no_callback) {
1670 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1671 ASSERT(buf->b_efunc == NULL);
1672 hdr->b_flags |= ARC_BUF_AVAILABLE;
1673 }
1674 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1675 refcount_is_zero(&hdr->b_refcnt));
1676 mutex_exit(hash_lock);
1677 return (no_callback);
1678 }
1679
1680 int
1681 arc_buf_size(arc_buf_t *buf)
1682 {
1683 return (buf->b_hdr->b_size);
1684 }
1685
1686 /*
1687 * Called from the DMU to determine if the current buffer should be
1688 * evicted. In order to ensure proper locking, the eviction must be initiated
1689 * from the DMU. Return true if the buffer is associated with user data and
1690 * duplicate buffers still exist.
1691 */
1692 boolean_t
1693 arc_buf_eviction_needed(arc_buf_t *buf)
1694 {
1695 arc_buf_hdr_t *hdr;
1696 boolean_t evict_needed = B_FALSE;
1697
1698 if (zfs_disable_dup_eviction)
1699 return (B_FALSE);
1700
1701 mutex_enter(&buf->b_evict_lock);
1702 hdr = buf->b_hdr;
1703 if (hdr == NULL) {
1704 /*
1705 * We are in arc_do_user_evicts(); let that function
1706 * perform the eviction.
1707 */
1708 ASSERT(buf->b_data == NULL);
1709 mutex_exit(&buf->b_evict_lock);
1710 return (B_FALSE);
1711 } else if (buf->b_data == NULL) {
1712 /*
1713 * We have already been added to the arc eviction list;
1714 * recommend eviction.
1715 */
1716 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1717 mutex_exit(&buf->b_evict_lock);
1718 return (B_TRUE);
1719 }
1720
1721 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1722 evict_needed = B_TRUE;
1723
1724 mutex_exit(&buf->b_evict_lock);
1725 return (evict_needed);
1726 }
1727
1728 /*
1729 * Evict buffers from list until we've removed the specified number of
1730 * bytes. Move the removed buffers to the appropriate evict state.
1731 * If the recycle flag is set, then attempt to "recycle" a buffer:
1732 * - look for a buffer to evict that is `bytes' long.
1733 * - return the data block from this buffer rather than freeing it.
1734 * This flag is used by callers that are trying to make space for a
1735 * new buffer in a full arc cache.
1736 *
1737 * This function makes a "best effort". It skips over any buffers
1738 * it can't get a hash_lock on, and so may not catch all candidates.
1739 * It may also return without evicting as much space as requested.
1740 */
1741 static void *
1742 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1743 arc_buf_contents_t type)
1744 {
1745 arc_state_t *evicted_state;
1746 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1747 arc_buf_hdr_t *ab, *ab_prev = NULL;
1748 list_t *list = &state->arcs_list[type];
1749 kmutex_t *hash_lock;
1750 boolean_t have_lock;
1751 void *stolen = NULL;
1752
1753 ASSERT(state == arc_mru || state == arc_mfu);
1754
1755 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1756
1757 mutex_enter(&state->arcs_mtx);
1758 mutex_enter(&evicted_state->arcs_mtx);
1759
1760 for (ab = list_tail(list); ab; ab = ab_prev) {
1761 ab_prev = list_prev(list, ab);
1762 /* prefetch buffers have a minimum lifespan */
1763 if (HDR_IO_IN_PROGRESS(ab) ||
1764 (spa && ab->b_spa != spa) ||
1765 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1766 ddi_get_lbolt() - ab->b_arc_access <
1767 zfs_arc_min_prefetch_lifespan)) {
1768 skipped++;
1769 continue;
1770 }
1771 /* "lookahead" for better eviction candidate */
1772 if (recycle && ab->b_size != bytes &&
1773 ab_prev && ab_prev->b_size == bytes)
1774 continue;
1775 hash_lock = HDR_LOCK(ab);
1776 have_lock = MUTEX_HELD(hash_lock);
1777 if (have_lock || mutex_tryenter(hash_lock)) {
1778 ASSERT0(refcount_count(&ab->b_refcnt));
1779 ASSERT(ab->b_datacnt > 0);
1780 while (ab->b_buf) {
1781 arc_buf_t *buf = ab->b_buf;
1782 if (!mutex_tryenter(&buf->b_evict_lock)) {
1783 missed += 1;
1784 break;
1785 }
1786 if (buf->b_data) {
1787 bytes_evicted += ab->b_size;
1788 if (recycle && ab->b_type == type &&
1789 ab->b_size == bytes &&
1790 !HDR_L2_WRITING(ab)) {
1791 stolen = buf->b_data;
1792 recycle = FALSE;
1793 }
1794 }
1795 if (buf->b_efunc) {
1796 mutex_enter(&arc_eviction_mtx);
1797 arc_buf_destroy(buf,
1798 buf->b_data == stolen, FALSE);
1799 ab->b_buf = buf->b_next;
1800 buf->b_hdr = &arc_eviction_hdr;
1801 buf->b_next = arc_eviction_list;
1802 arc_eviction_list = buf;
1803 mutex_exit(&arc_eviction_mtx);
1804 mutex_exit(&buf->b_evict_lock);
1805 } else {
1806 mutex_exit(&buf->b_evict_lock);
1807 arc_buf_destroy(buf,
1808 buf->b_data == stolen, TRUE);
1809 }
1810 }
1811
1812 if (ab->b_l2hdr) {
1813 ARCSTAT_INCR(arcstat_evict_l2_cached,
1814 ab->b_size);
1815 } else {
1816 if (l2arc_write_eligible(ab->b_spa, ab)) {
1817 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1818 ab->b_size);
1819 } else {
1820 ARCSTAT_INCR(
1821 arcstat_evict_l2_ineligible,
1822 ab->b_size);
1823 }
1824 }
1825
1826 if (ab->b_datacnt == 0) {
1827 arc_change_state(evicted_state, ab, hash_lock);
1828 ASSERT(HDR_IN_HASH_TABLE(ab));
1829 ab->b_flags |= ARC_IN_HASH_TABLE;
1830 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1831 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1832 }
1833 if (!have_lock)
1834 mutex_exit(hash_lock);
1835 if (bytes >= 0 && bytes_evicted >= bytes)
1836 break;
1837 } else {
1838 missed += 1;
1839 }
1840 }
1841
1842 mutex_exit(&evicted_state->arcs_mtx);
1843 mutex_exit(&state->arcs_mtx);
1844
1845 if (bytes_evicted < bytes)
1846 dprintf("only evicted %lld bytes from %p\n",
1847 (longlong_t)bytes_evicted, state);
1848
1849 if (skipped)
1850 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1851
1852 if (missed)
1853 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1854
1855 /*
1856 * We have just evicted some data into the ghost state, so make
1857 * sure we also adjust the ghost state size if necessary.
1858 */
1859 if (arc_no_grow &&
1860 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1861 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1862 arc_mru_ghost->arcs_size - arc_c;
1863
1864 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1865 int64_t todelete =
1866 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1867 arc_evict_ghost(arc_mru_ghost, 0, todelete);
1868 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1869 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1870 arc_mru_ghost->arcs_size +
1871 arc_mfu_ghost->arcs_size - arc_c);
1872 arc_evict_ghost(arc_mfu_ghost, 0, todelete);
1873 }
1874 }
1875
1876 return (stolen);
1877 }
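/*
 * Illustrative sketch of the recycle path described above; `size' stands
 * for a caller-chosen block size, and the calls mirror what
 * arc_get_data_buf() does further below when the cache is full:
 *
 *	void *data;
 *
 *	data = arc_evict(arc_mru, 0, size, TRUE, ARC_BUFC_DATA);
 *	if (data == NULL)
 *		data = zio_data_buf_alloc(size);	(recycle miss)
 *
 * On a successful recycle the evicted buffer's data block is handed
 * straight to the new consumer, saving a free/alloc pair of `size' bytes.
 */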
1878
1879 /*
1880 * Remove buffers from list until we've removed the specified number of
1881 * bytes. Destroy the buffers that are removed.
1882 */
1883 static void
1884 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1885 {
1886 arc_buf_hdr_t *ab, *ab_prev;
1887 arc_buf_hdr_t marker;
1888 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1889 kmutex_t *hash_lock;
1890 uint64_t bytes_deleted = 0;
1891 uint64_t bufs_skipped = 0;
1892
1893 ASSERT(GHOST_STATE(state));
1894 bzero(&marker, sizeof (marker));
1895 top:
1896 mutex_enter(&state->arcs_mtx);
1897 for (ab = list_tail(list); ab; ab = ab_prev) {
1898 ab_prev = list_prev(list, ab);
1899 if (spa && ab->b_spa != spa)
1900 continue;
1901
1902 /* ignore markers */
1903 if (ab->b_spa == 0)
1904 continue;
1905
1906 hash_lock = HDR_LOCK(ab);
1907 /* caller may be trying to modify this buffer, skip it */
1908 if (MUTEX_HELD(hash_lock))
1909 continue;
1910 if (mutex_tryenter(hash_lock)) {
1911 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1912 ASSERT(ab->b_buf == NULL);
1913 ARCSTAT_BUMP(arcstat_deleted);
1914 bytes_deleted += ab->b_size;
1915
1916 if (ab->b_l2hdr != NULL) {
1917 /*
1918 * This buffer is cached on the 2nd Level ARC;
1919 * don't destroy the header.
1920 */
1921 arc_change_state(arc_l2c_only, ab, hash_lock);
1922 mutex_exit(hash_lock);
1923 } else {
1924 arc_change_state(arc_anon, ab, hash_lock);
1925 mutex_exit(hash_lock);
1926 arc_hdr_destroy(ab);
1927 }
1928
1929 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1930 if (bytes >= 0 && bytes_deleted >= bytes)
1931 break;
1932 } else if (bytes < 0) {
1933 /*
1934 * Insert a list marker and then wait for the
1935 * hash lock to become available. Once it's
1936 * available, restart from where we left off.
1937 */
1938 list_insert_after(list, ab, &marker);
1939 mutex_exit(&state->arcs_mtx);
1940 mutex_enter(hash_lock);
1941 mutex_exit(hash_lock);
1942 mutex_enter(&state->arcs_mtx);
1943 ab_prev = list_prev(list, &marker);
1944 list_remove(list, &marker);
1945 } else
1946 bufs_skipped += 1;
1947 }
1948 mutex_exit(&state->arcs_mtx);
1949
1950 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1951 (bytes < 0 || bytes_deleted < bytes)) {
1952 list = &state->arcs_list[ARC_BUFC_METADATA];
1953 goto top;
1954 }
1955
1956 if (bufs_skipped) {
1957 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1958 ASSERT(bytes >= 0);
1959 }
1960
1961 if (bytes_deleted < bytes)
1962 dprintf("only deleted %lld bytes from %p\n",
1963 (longlong_t)bytes_deleted, state);
1964 }
1965
1966 static void
1967 arc_adjust(void)
1968 {
1969 int64_t adjustment, delta;
1970
1971 /*
1972 * Adjust MRU size
1973 */
1974
1975 adjustment = MIN((int64_t)(arc_size - arc_c),
1976 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1977 arc_p));
1978
1979 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1980 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1981 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
1982 adjustment -= delta;
1983 }
1984
1985 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1986 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1987 (void) arc_evict(arc_mru, 0, delta, FALSE,
1988 ARC_BUFC_METADATA);
1989 }
1990
1991 /*
1992 * Adjust MFU size
1993 */
1994
1995 adjustment = arc_size - arc_c;
1996
1997 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1998 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1999 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2000 adjustment -= delta;
2001 }
2002
2003 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2004 int64_t delta = MIN(adjustment,
2005 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2006 (void) arc_evict(arc_mfu, 0, delta, FALSE,
2007 ARC_BUFC_METADATA);
2008 }
2009
2010 /*
2011 * Adjust ghost lists
2012 */
2013
2014 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2015
2016 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2017 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2018 arc_evict_ghost(arc_mru_ghost, 0, delta);
2019 }
2020
2021 adjustment =
2022 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2023
2024 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2025 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2026 arc_evict_ghost(arc_mfu_ghost, 0, delta);
2027 }
2028 }
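/*
 * A worked example of the MRU adjustment above, using assumed sizes:
 * with arc_size = 10GB, arc_c = 8GB, arc_anon + arc_mru + arc_meta_used
 * = 5GB and arc_p = 4GB, the MRU target is MIN(10GB - 8GB, 5GB - 4GB) =
 * 1GB, taken first from evictable MRU data and then from MRU metadata.
 * The MFU pass then evicts whatever of the remaining (arc_size - arc_c)
 * overshoot is left, and the ghost lists are trimmed back toward arc_c
 * last.
 */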
2029
2030 /*
2031 * Request that the arc user drop references so that N bytes can be
2032 * released from the cache. This provides a mechanism to ensure the arc
2033 * can honor the arc_meta_limit and reclaim buffers which are pinned in
2034 * the cache by higher layers (i.e. the zpl).
2035 */
2036 static void
2037 arc_do_user_prune(int64_t adjustment)
2038 {
2039 arc_prune_func_t *func;
2040 void *private;
2041 arc_prune_t *cp, *np;
2042
2043 mutex_enter(&arc_prune_mtx);
2044
2045 cp = list_head(&arc_prune_list);
2046 while (cp != NULL) {
2047 func = cp->p_pfunc;
2048 private = cp->p_private;
2049 np = list_next(&arc_prune_list, cp);
2050 refcount_add(&cp->p_refcnt, func);
2051 mutex_exit(&arc_prune_mtx);
2052
2053 if (func != NULL)
2054 func(adjustment, private);
2055
2056 mutex_enter(&arc_prune_mtx);
2057
2058 /* User removed prune callback concurrently with execution */
2059 if (refcount_remove(&cp->p_refcnt, func) == 0) {
2060 ASSERT(!list_link_active(&cp->p_node));
2061 refcount_destroy(&cp->p_refcnt);
2062 kmem_free(cp, sizeof (*cp));
2063 }
2064
2065 cp = np;
2066 }
2067
2068 ARCSTAT_BUMP(arcstat_prune);
2069 mutex_exit(&arc_prune_mtx);
2070 }
2071
2072 static void
2073 arc_do_user_evicts(void)
2074 {
2075 mutex_enter(&arc_eviction_mtx);
2076 while (arc_eviction_list != NULL) {
2077 arc_buf_t *buf = arc_eviction_list;
2078 arc_eviction_list = buf->b_next;
2079 mutex_enter(&buf->b_evict_lock);
2080 buf->b_hdr = NULL;
2081 mutex_exit(&buf->b_evict_lock);
2082 mutex_exit(&arc_eviction_mtx);
2083
2084 if (buf->b_efunc != NULL)
2085 VERIFY(buf->b_efunc(buf) == 0);
2086
2087 buf->b_efunc = NULL;
2088 buf->b_private = NULL;
2089 kmem_cache_free(buf_cache, buf);
2090 mutex_enter(&arc_eviction_mtx);
2091 }
2092 mutex_exit(&arc_eviction_mtx);
2093 }
2094
2095 /*
2096 * Evict only metadata objects from the cache, leaving the data objects.
2097 * This is only used to enforce the tunable arc_meta_limit; if we are
2098 * unable to evict enough buffers, notify the user via the prune callback.
2099 */
2100 void
2101 arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
2102 {
2103 int64_t delta;
2104
2105 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2106 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2107 arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
2108 adjustment -= delta;
2109 }
2110
2111 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2112 delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2113 arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
2114 adjustment -= delta;
2115 }
2116
2117 if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
2118 arc_do_user_prune(zfs_arc_meta_prune);
2119 }
2120
2121 /*
2122 * Flush all *evictable* data from the cache for the given spa.
2123 * NOTE: this will not touch "active" (i.e. referenced) data.
2124 */
2125 void
2126 arc_flush(spa_t *spa)
2127 {
2128 uint64_t guid = 0;
2129
2130 if (spa)
2131 guid = spa_load_guid(spa);
2132
2133 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2134 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2135 if (spa)
2136 break;
2137 }
2138 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2139 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2140 if (spa)
2141 break;
2142 }
2143 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2144 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2145 if (spa)
2146 break;
2147 }
2148 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2149 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2150 if (spa)
2151 break;
2152 }
2153
2154 arc_evict_ghost(arc_mru_ghost, guid, -1);
2155 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2156
2157 mutex_enter(&arc_reclaim_thr_lock);
2158 arc_do_user_evicts();
2159 mutex_exit(&arc_reclaim_thr_lock);
2160 ASSERT(spa || arc_eviction_list == NULL);
2161 }
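/*
 * For example, arc_flush(NULL) keeps looping until every evictable
 * buffer from any pool is gone, whereas arc_flush(spa) matches only that
 * pool's load guid and makes a single eviction pass per list (note the
 * early break above) before draining its entries from the ghost lists
 * and running any pending user evictions.
 */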
2162
2163 void
2164 arc_shrink(uint64_t bytes)
2165 {
2166 if (arc_c > arc_c_min) {
2167 uint64_t to_free;
2168
2169 to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
2170
2171 if (arc_c > arc_c_min + to_free)
2172 atomic_add_64(&arc_c, -to_free);
2173 else
2174 arc_c = arc_c_min;
2175
2176 atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
2177 if (arc_c > arc_size)
2178 arc_c = MAX(arc_size, arc_c_min);
2179 if (arc_p > arc_c)
2180 arc_p = (arc_c >> 1);
2181 ASSERT(arc_c >= arc_c_min);
2182 ASSERT((int64_t)arc_p >= 0);
2183 }
2184
2185 if (arc_size > arc_c)
2186 arc_adjust();
2187 }
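/*
 * A quick example of the arithmetic above, assuming arc_c = 8GB,
 * arc_c_min = 1GB and zfs_arc_shrink_shift = 5: a call with bytes == 0
 * frees arc_c >> 5 = 256MB of target, dropping arc_c to 7.75GB, and
 * shaves arc_p by the same shift. If arc_size still exceeds the new
 * arc_c, arc_adjust() is invoked to evict the difference.
 */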
2188
2189 static void
2190 arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
2191 {
2192 size_t i;
2193 kmem_cache_t *prev_cache = NULL;
2194 kmem_cache_t *prev_data_cache = NULL;
2195 extern kmem_cache_t *zio_buf_cache[];
2196 extern kmem_cache_t *zio_data_buf_cache[];
2197
2198 /*
2199 * An aggressive reclamation will shrink the cache size as well as
2200 * reap free buffers from the arc kmem caches.
2201 */
2202 if (strat == ARC_RECLAIM_AGGR)
2203 arc_shrink(bytes);
2204
2205 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2206 if (zio_buf_cache[i] != prev_cache) {
2207 prev_cache = zio_buf_cache[i];
2208 kmem_cache_reap_now(zio_buf_cache[i]);
2209 }
2210 if (zio_data_buf_cache[i] != prev_data_cache) {
2211 prev_data_cache = zio_data_buf_cache[i];
2212 kmem_cache_reap_now(zio_data_buf_cache[i]);
2213 }
2214 }
2215
2216 kmem_cache_reap_now(buf_cache);
2217 kmem_cache_reap_now(hdr_cache);
2218 }
2219
2220 /*
2221 * Unlike other ZFS implementations, this thread is only responsible for
2222 * adapting the target ARC size on Linux. The responsibility for memory
2223 * reclamation has been entirely delegated to the arc_shrinker_func(),
2224 * which is registered with the VM. To reflect this change in behavior,
2225 * the arc_reclaim thread has been renamed to arc_adapt.
2226 */
2227 static void
2228 arc_adapt_thread(void)
2229 {
2230 callb_cpr_t cpr;
2231 int64_t prune;
2232
2233 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2234
2235 mutex_enter(&arc_reclaim_thr_lock);
2236 while (arc_thread_exit == 0) {
2237 #ifndef _KERNEL
2238 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2239
2240 if (spa_get_random(100) == 0) {
2241
2242 if (arc_no_grow) {
2243 if (last_reclaim == ARC_RECLAIM_CONS) {
2244 last_reclaim = ARC_RECLAIM_AGGR;
2245 } else {
2246 last_reclaim = ARC_RECLAIM_CONS;
2247 }
2248 } else {
2249 arc_no_grow = TRUE;
2250 last_reclaim = ARC_RECLAIM_AGGR;
2251 membar_producer();
2252 }
2253
2254 /* reset the growth delay for every reclaim */
2255 arc_grow_time = ddi_get_lbolt()+(zfs_arc_grow_retry * hz);
2256
2257 arc_kmem_reap_now(last_reclaim, 0);
2258 arc_warm = B_TRUE;
2259 }
2260 #endif /* !_KERNEL */
2261
2262 /* No recent memory pressure; allow the ARC to grow. */
2263 if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
2264 arc_no_grow = FALSE;
2265
2266 /*
2267 * Keep metadata usage within limits; arc_shrink() is not
2268 * used here because it would collapse the arc_c value when
2269 * only the arc_meta_limit is being exceeded.
2270 */
2271 prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
2272 if (prune > 0)
2273 arc_adjust_meta(prune, B_TRUE);
2274
2275 arc_adjust();
2276
2277 if (arc_eviction_list != NULL)
2278 arc_do_user_evicts();
2279
2280 /* block until needed, or one second, whichever is shorter */
2281 CALLB_CPR_SAFE_BEGIN(&cpr);
2282 (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
2283 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2284 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2285
2286
2287 /* Allow the module options to be changed */
2288 if (zfs_arc_max > 64 << 20 &&
2289 zfs_arc_max < physmem * PAGESIZE &&
2290 zfs_arc_max != arc_c_max)
2291 arc_c_max = zfs_arc_max;
2292
2293 if (zfs_arc_min > 0 &&
2294 zfs_arc_min < arc_c_max &&
2295 zfs_arc_min != arc_c_min)
2296 arc_c_min = zfs_arc_min;
2297
2298 if (zfs_arc_meta_limit > 0 &&
2299 zfs_arc_meta_limit <= arc_c_max &&
2300 zfs_arc_meta_limit != arc_meta_limit)
2301 arc_meta_limit = zfs_arc_meta_limit;
2302
2303
2304
2305 }
2306
2307 arc_thread_exit = 0;
2308 cv_broadcast(&arc_reclaim_thr_cv);
2309 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2310 thread_exit();
2311 }
2312
2313 #ifdef _KERNEL
2314 /*
2315 * Determine the amount of memory eligible for eviction contained in the
2316 * ARC. All clean data reported by the ghost lists can always be safely
2317 * evicted. Due to arc_c_min, the same does not hold for all clean data
2318 * contained by the regular mru and mfu lists.
2319 *
2320 * In the case of the regular mru and mfu lists, we need to report as
2321 * much clean data as possible, such that evicting that same reported
2322 * data will not bring arc_size below arc_c_min. Thus, in certain
2323 * circumstances, the total amount of clean data in the mru and mfu
2324 * lists might not actually be evictable.
2325 *
2326 * The following two distinct cases are accounted for:
2327 *
2328 * 1. The sum of the amount of dirty data contained by both the mru and
2329 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
2330 * is greater than or equal to arc_c_min.
2331 * (i.e. amount of dirty data >= arc_c_min)
2332 *
2333 * This is the easy case; all clean data contained by the mru and mfu
2334 * lists is evictable. Evicting all clean data can only drop arc_size
2335 * to the amount of dirty data, which is greater than arc_c_min.
2336 *
2337 * 2. The sum of the amount of dirty data contained by both the mru and
2338 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
2339 * is less than arc_c_min.
2340 * (i.e. arc_c_min > amount of dirty data)
2341 *
2342 * 2.1. arc_size is greater than or equal to arc_c_min.
2343 * (i.e. arc_size >= arc_c_min > amount of dirty data)
2344 *
2345 * In this case, not all clean data from the regular mru and mfu
2346 * lists is actually evictable; we must leave enough clean data
2347 * to keep arc_size above arc_c_min. Thus, the maximum amount of
2348 * evictable data from the two lists combined, is exactly the
2349 * difference between arc_size and arc_c_min.
2350 *
2351 * 2.2. arc_size is less than arc_c_min
2352 * (i.e. arc_c_min > arc_size > amount of dirty data)
2353 *
2354 * In this case, none of the data contained in the mru and mfu
2355 * lists is evictable, even if it's clean. Since arc_size is
2356 * already below arc_c_min, evicting any more would only
2357 * increase this negative difference.
2358 */
2359 static uint64_t
2360 arc_evictable_memory(void) {
2361 uint64_t arc_clean =
2362 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
2363 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
2364 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
2365 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
2366 uint64_t ghost_clean =
2367 arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
2368 arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
2369 arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
2370 arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
2371 uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
2372
2373 if (arc_dirty >= arc_c_min)
2374 return (ghost_clean + arc_clean);
2375
2376 return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
2377 }
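/*
 * A worked instance of case 2.1 above, with assumed sizes: if
 * arc_size = 6GB, arc_c_min = 4GB and arc_clean = 5GB, then
 * arc_dirty = 1GB < arc_c_min, so only ghost_clean + (6GB - 4GB) =
 * ghost_clean + 2GB is reported as evictable even though 5GB of the
 * regular lists is clean; evicting more would drop arc_size below
 * arc_c_min.
 */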
2378
2379 static int
2380 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
2381 {
2382 uint64_t pages;
2383
2384 /* The arc is considered warm once reclaim has occurred */
2385 if (unlikely(arc_warm == B_FALSE))
2386 arc_warm = B_TRUE;
2387
2388 /* Return the potential number of reclaimable pages */
2389 pages = btop(arc_evictable_memory());
2390 if (sc->nr_to_scan == 0)
2391 return (pages);
2392
2393 /* Not allowed to perform filesystem reclaim */
2394 if (!(sc->gfp_mask & __GFP_FS))
2395 return (-1);
2396
2397 /* Reclaim in progress */
2398 if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
2399 return (-1);
2400
2401 /*
2402 * Evict the requested number of pages by shrinking arc_c by the
2403 * requested amount. If there is nothing left to evict, just
2404 * reap whatever we can from the various arc slabs.
2405 */
2406 if (pages > 0) {
2407 arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
2408 } else {
2409 arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
2410 }
2411
2412 /*
2413 * When direct reclaim is observed, it usually indicates a rapid
2414 * increase in memory pressure. This occurs because the kswapd
2415 * threads were unable to asynchronously keep enough free memory
2416 * available. In this case set arc_no_grow to briefly pause arc
2417 * growth to avoid compounding the memory pressure.
2418 */
2419 if (current_is_kswapd()) {
2420 ARCSTAT_BUMP(arcstat_memory_indirect_count);
2421 } else {
2422 arc_no_grow = B_TRUE;
2423 arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
2424 ARCSTAT_BUMP(arcstat_memory_direct_count);
2425 }
2426
2427 mutex_exit(&arc_reclaim_thr_lock);
2428
2429 return (-1);
2430 }
2431 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
2432
2433 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
2434 #endif /* _KERNEL */
2435
2436 /*
2437 * Adapt arc info given the number of bytes we are trying to add and
2438 * the state that we are coming from. This function is only called
2439 * when we are adding new content to the cache.
2440 */
2441 static void
2442 arc_adapt(int bytes, arc_state_t *state)
2443 {
2444 int mult;
2445 uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
2446
2447 if (state == arc_l2c_only)
2448 return;
2449
2450 ASSERT(bytes > 0);
2451 /*
2452 * Adapt the target size of the MRU list:
2453 * - if we just hit in the MRU ghost list, then increase
2454 * the target size of the MRU list.
2455 * - if we just hit in the MFU ghost list, then increase
2456 * the target size of the MFU list by decreasing the
2457 * target size of the MRU list.
2458 */
2459 if (state == arc_mru_ghost) {
2460 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2461 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2462 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2463
2464 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2465 } else if (state == arc_mfu_ghost) {
2466 uint64_t delta;
2467
2468 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2469 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2470 mult = MIN(mult, 10);
2471
2472 delta = MIN(bytes * mult, arc_p);
2473 arc_p = MAX(arc_p_min, arc_p - delta);
2474 }
2475 ASSERT((int64_t)arc_p >= 0);
2476
2477 if (arc_no_grow)
2478 return;
2479
2480 if (arc_c >= arc_c_max)
2481 return;
2482
2483 /*
2484 * If we're within (2 * maxblocksize) bytes of the target
2485 * cache size, increment the target cache size
2486 */
2487 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2488 atomic_add_64(&arc_c, (int64_t)bytes);
2489 if (arc_c > arc_c_max)
2490 arc_c = arc_c_max;
2491 else if (state == arc_anon)
2492 atomic_add_64(&arc_p, (int64_t)bytes);
2493 if (arc_p > arc_c)
2494 arc_p = arc_c;
2495 }
2496 ASSERT((int64_t)arc_p >= 0);
2497 }
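/*
 * For example (assumed sizes): a hit in arc_mru_ghost while
 * arc_mfu_ghost is three times its size gives mult = 3 (capped at 10),
 * so arc_p grows by 3 * bytes, clamped to arc_c - arc_p_min. A hit in
 * arc_mfu_ghost instead shrinks arc_p by MIN(bytes * mult, arc_p),
 * floored at arc_p_min, shifting the balance toward the MFU side.
 */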
2498
2499 /*
2500 * Check if the cache has reached its limits and eviction is required
2501 * prior to insert.
2502 */
2503 static int
2504 arc_evict_needed(arc_buf_contents_t type)
2505 {
2506 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2507 return (1);
2508
2509 if (arc_no_grow)
2510 return (1);
2511
2512 return (arc_size > arc_c);
2513 }
2514
2515 /*
2516 * The buffer, supplied as the first argument, needs a data block.
2517 * So, if we are at cache max, determine which cache should be victimized.
2518 * We have the following cases:
2519 *
2520 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2521 * In this situation if we're out of space, but the resident size of the MFU is
2522 * under the limit, victimize the MFU cache to satisfy this insertion request.
2523 *
2524 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2525 * Here, we've used up all of the available space for the MRU, so we need to
2526 * evict from our own cache instead. Evict from the set of resident MRU
2527 * entries.
2528 *
2529 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2530 * c minus p represents the MFU space in the cache, since p is the size of the
2531 * cache that is dedicated to the MRU. In this situation there's still space on
2532 * the MFU side, so the MRU side needs to be victimized.
2533 *
2534 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2535 * MFU's resident set is consuming more space than it has been allotted. In
2536 * this situation, we must victimize our own cache, the MFU, for this insertion.
2537 */
2538 static void
2539 arc_get_data_buf(arc_buf_t *buf)
2540 {
2541 arc_state_t *state = buf->b_hdr->b_state;
2542 uint64_t size = buf->b_hdr->b_size;
2543 arc_buf_contents_t type = buf->b_hdr->b_type;
2544
2545 arc_adapt(size, state);
2546
2547 /*
2548 * We have not yet reached cache maximum size,
2549 * just allocate a new buffer.
2550 */
2551 if (!arc_evict_needed(type)) {
2552 if (type == ARC_BUFC_METADATA) {
2553 buf->b_data = zio_buf_alloc(size);
2554 arc_space_consume(size, ARC_SPACE_DATA);
2555 } else {
2556 ASSERT(type == ARC_BUFC_DATA);
2557 buf->b_data = zio_data_buf_alloc(size);
2558 ARCSTAT_INCR(arcstat_data_size, size);
2559 atomic_add_64(&arc_size, size);
2560 }
2561 goto out;
2562 }
2563
2564 /*
2565 * If we are prefetching from the mfu ghost list, this buffer
2566 * will end up on the mru list, so steal space from there.
2567 */
2568 if (state == arc_mfu_ghost)
2569 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2570 else if (state == arc_mru_ghost)
2571 state = arc_mru;
2572
2573 if (state == arc_mru || state == arc_anon) {
2574 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2575 state = (arc_mfu->arcs_lsize[type] >= size &&
2576 arc_p > mru_used) ? arc_mfu : arc_mru;
2577 } else {
2578 /* MFU cases */
2579 uint64_t mfu_space = arc_c - arc_p;
2580 state = (arc_mru->arcs_lsize[type] >= size &&
2581 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2582 }
2583
2584 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2585 if (type == ARC_BUFC_METADATA) {
2586 buf->b_data = zio_buf_alloc(size);
2587 arc_space_consume(size, ARC_SPACE_DATA);
2588
2589 /*
2590 * If we are unable to recycle an existing meta buffer,
2591 * signal the reclaim thread. It will notify users
2592 * via the prune callback to drop references. The
2593 * prune callback is run in the context of the reclaim
2594 * thread to avoid deadlocking on the hash_lock.
2595 */
2596 cv_signal(&arc_reclaim_thr_cv);
2597 } else {
2598 ASSERT(type == ARC_BUFC_DATA);
2599 buf->b_data = zio_data_buf_alloc(size);
2600 ARCSTAT_INCR(arcstat_data_size, size);
2601 atomic_add_64(&arc_size, size);
2602 }
2603
2604 ARCSTAT_BUMP(arcstat_recycle_miss);
2605 }
2606 ASSERT(buf->b_data != NULL);
2607 out:
2608 /*
2609 * Update the state size. Note that ghost states have a
2610 * "ghost size" and so don't need to be updated.
2611 */
2612 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2613 arc_buf_hdr_t *hdr = buf->b_hdr;
2614
2615 atomic_add_64(&hdr->b_state->arcs_size, size);
2616 if (list_link_active(&hdr->b_arc_node)) {
2617 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2618 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2619 }
2620 /*
2621 * If we are growing the cache, and we are adding anonymous
2622 * data, and we have outgrown arc_p, update arc_p
2623 */
2624 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2625 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2626 arc_p = MIN(arc_c, arc_p + size);
2627 }
2628 }
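/*
 * To make case 3 above concrete (assumed sizes): with arc_c = 8GB and
 * arc_p = 5GB, the MFU side is allotted c - p = 3GB. A miss destined
 * for the MFU while arc_mfu->arcs_size is only 2.5GB and the MRU list
 * holds at least `size' evictable bytes of the right type will steal
 * the block from arc_mru; otherwise (case 4) the MFU evicts from
 * itself.
 */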
2629
2630 /*
2631 * This routine is called whenever a buffer is accessed.
2632 * NOTE: the hash lock is dropped in this function.
2633 */
2634 static void
2635 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2636 {
2637 clock_t now;
2638
2639 ASSERT(MUTEX_HELD(hash_lock));
2640
2641 if (buf->b_state == arc_anon) {
2642 /*
2643 * This buffer is not in the cache, and does not
2644 * appear in our "ghost" list. Add the new buffer
2645 * to the MRU state.
2646 */
2647
2648 ASSERT(buf->b_arc_access == 0);
2649 buf->b_arc_access = ddi_get_lbolt();
2650 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2651 arc_change_state(arc_mru, buf, hash_lock);
2652
2653 } else if (buf->b_state == arc_mru) {
2654 now = ddi_get_lbolt();
2655
2656 /*
2657 * If this buffer is here because of a prefetch, then either:
2658 * - clear the flag if this is a "referencing" read
2659 * (any subsequent access will bump this into the MFU state).
2660 * or
2661 * - move the buffer to the head of the list if this is
2662 * another prefetch (to make it less likely to be evicted).
2663 */
2664 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2665 if (refcount_count(&buf->b_refcnt) == 0) {
2666 ASSERT(list_link_active(&buf->b_arc_node));
2667 } else {
2668 buf->b_flags &= ~ARC_PREFETCH;
2669 ARCSTAT_BUMP(arcstat_mru_hits);
2670 }
2671 buf->b_arc_access = now;
2672 return;
2673 }
2674
2675 /*
2676 * This buffer has been "accessed" only once so far,
2677 * but it is still in the cache. Move it to the MFU
2678 * state.
2679 */
2680 if (now > buf->b_arc_access + ARC_MINTIME) {
2681 /*
2682 * More than 125ms have passed since we
2683 * instantiated this buffer. Move it to the
2684 * most frequently used state.
2685 */
2686 buf->b_arc_access = now;
2687 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2688 arc_change_state(arc_mfu, buf, hash_lock);
2689 }
2690 ARCSTAT_BUMP(arcstat_mru_hits);
2691 } else if (buf->b_state == arc_mru_ghost) {
2692 arc_state_t *new_state;
2693 /*
2694 * This buffer has been "accessed" recently, but
2695 * was evicted from the cache. Move it to the
2696 * MFU state.
2697 */
2698
2699 if (buf->b_flags & ARC_PREFETCH) {
2700 new_state = arc_mru;
2701 if (refcount_count(&buf->b_refcnt) > 0)
2702 buf->b_flags &= ~ARC_PREFETCH;
2703 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2704 } else {
2705 new_state = arc_mfu;
2706 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2707 }
2708
2709 buf->b_arc_access = ddi_get_lbolt();
2710 arc_change_state(new_state, buf, hash_lock);
2711
2712 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2713 } else if (buf->b_state == arc_mfu) {
2714 /*
2715 * This buffer has been accessed more than once and is
2716 * still in the cache. Keep it in the MFU state.
2717 *
2718 * NOTE: an add_reference() that occurred when we did
2719 * the arc_read() will have kicked this off the list.
2720 * If it was a prefetch, we will explicitly move it to
2721 * the head of the list now.
2722 */
2723 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2724 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2725 ASSERT(list_link_active(&buf->b_arc_node));
2726 }
2727 ARCSTAT_BUMP(arcstat_mfu_hits);
2728 buf->b_arc_access = ddi_get_lbolt();
2729 } else if (buf->b_state == arc_mfu_ghost) {
2730 arc_state_t *new_state = arc_mfu;
2731 /*
2732 * This buffer has been accessed more than once but has
2733 * been evicted from the cache. Move it back to the
2734 * MFU state.
2735 */
2736
2737 if (buf->b_flags & ARC_PREFETCH) {
2738 /*
2739 * This is a prefetch access...
2740 * move this block back to the MRU state.
2741 */
2742 ASSERT0(refcount_count(&buf->b_refcnt));
2743 new_state = arc_mru;
2744 }
2745
2746 buf->b_arc_access = ddi_get_lbolt();
2747 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2748 arc_change_state(new_state, buf, hash_lock);
2749
2750 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2751 } else if (buf->b_state == arc_l2c_only) {
2752 /*
2753 * This buffer is on the 2nd Level ARC.
2754 */
2755
2756 buf->b_arc_access = ddi_get_lbolt();
2757 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2758 arc_change_state(arc_mfu, buf, hash_lock);
2759 } else {
2760 ASSERT(!"invalid arc state");
2761 }
2762 }
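/*
 * Putting the state machine above together: a block read once lands in
 * the MRU state; a second demand read that arrives after ARC_MINTIME
 * has elapsed promotes it to the MFU, while a quicker re-read only
 * refreshes b_arc_access and leaves it in the MRU, so a rapid burst of
 * reads is treated roughly as a single access.
 */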
2763
2764 /* a generic arc_done_func_t which you can use */
2765 /* ARGSUSED */
2766 void
2767 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2768 {
2769 if (zio == NULL || zio->io_error == 0)
2770 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2771 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2772 }
2773
2774 /* a generic arc_done_func_t */
2775 void
2776 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2777 {
2778 arc_buf_t **bufp = arg;
2779 if (zio && zio->io_error) {
2780 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2781 *bufp = NULL;
2782 } else {
2783 *bufp = buf;
2784 ASSERT(buf->b_data);
2785 }
2786 }
2787
2788 static void
2789 arc_read_done(zio_t *zio)
2790 {
2791 arc_buf_hdr_t *hdr, *found;
2792 arc_buf_t *buf;
2793 arc_buf_t *abuf; /* buffer we're assigning to callback */
2794 kmutex_t *hash_lock;
2795 arc_callback_t *callback_list, *acb;
2796 int freeable = FALSE;
2797
2798 buf = zio->io_private;
2799 hdr = buf->b_hdr;
2800
2801 /*
2802 * The hdr was inserted into hash-table and removed from lists
2803 * prior to starting I/O. We should find this header, since
2804 * it's in the hash table, and it should be legit since it's
2805 * not possible to evict it during the I/O. The only possible
2806 * reason for it not to be found is if we were freed during the
2807 * read.
2808 */
2809 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2810 &hash_lock);
2811
2812 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2813 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2814 (found == hdr && HDR_L2_READING(hdr)));
2815
2816 hdr->b_flags &= ~ARC_L2_EVICTED;
2817 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2818 hdr->b_flags &= ~ARC_L2CACHE;
2819
2820 /* byteswap if necessary */
2821 callback_list = hdr->b_acb;
2822 ASSERT(callback_list != NULL);
2823 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2824 dmu_object_byteswap_t bswap =
2825 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2826 if (BP_GET_LEVEL(zio->io_bp) > 0)
2827 byteswap_uint64_array(buf->b_data, hdr->b_size);
2828 else
2829 dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
2830 }
2831
2832 arc_cksum_compute(buf, B_FALSE);
2833
2834 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2835 /*
2836 * Only call arc_access on anonymous buffers. This is because
2837 * if we've issued an I/O for an evicted buffer, we've already
2838 * called arc_access (to prevent any simultaneous readers from
2839 * getting confused).
2840 */
2841 arc_access(hdr, hash_lock);
2842 }
2843
2844 /* create copies of the data buffer for the callers */
2845 abuf = buf;
2846 for (acb = callback_list; acb; acb = acb->acb_next) {
2847 if (acb->acb_done) {
2848 if (abuf == NULL) {
2849 ARCSTAT_BUMP(arcstat_duplicate_reads);
2850 abuf = arc_buf_clone(buf);
2851 }
2852 acb->acb_buf = abuf;
2853 abuf = NULL;
2854 }
2855 }
2856 hdr->b_acb = NULL;
2857 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2858 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2859 if (abuf == buf) {
2860 ASSERT(buf->b_efunc == NULL);
2861 ASSERT(hdr->b_datacnt == 1);
2862 hdr->b_flags |= ARC_BUF_AVAILABLE;
2863 }
2864
2865 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2866
2867 if (zio->io_error != 0) {
2868 hdr->b_flags |= ARC_IO_ERROR;
2869 if (hdr->b_state != arc_anon)
2870 arc_change_state(arc_anon, hdr, hash_lock);
2871 if (HDR_IN_HASH_TABLE(hdr))
2872 buf_hash_remove(hdr);
2873 freeable = refcount_is_zero(&hdr->b_refcnt);
2874 }
2875
2876 /*
2877 * Broadcast before we drop the hash_lock to avoid the possibility
2878 * that the hdr (and hence the cv) might be freed before we get to
2879 * the cv_broadcast().
2880 */
2881 cv_broadcast(&hdr->b_cv);
2882
2883 if (hash_lock) {
2884 mutex_exit(hash_lock);
2885 } else {
2886 /*
2887 * This block was freed while we waited for the read to
2888 * complete. It has been removed from the hash table and
2889 * moved to the anonymous state (so that it won't show up
2890 * in the cache).
2891 */
2892 ASSERT3P(hdr->b_state, ==, arc_anon);
2893 freeable = refcount_is_zero(&hdr->b_refcnt);
2894 }
2895
2896 /* execute each callback and free its structure */
2897 while ((acb = callback_list) != NULL) {
2898 if (acb->acb_done)
2899 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2900
2901 if (acb->acb_zio_dummy != NULL) {
2902 acb->acb_zio_dummy->io_error = zio->io_error;
2903 zio_nowait(acb->acb_zio_dummy);
2904 }
2905
2906 callback_list = acb->acb_next;
2907 kmem_free(acb, sizeof (arc_callback_t));
2908 }
2909
2910 if (freeable)
2911 arc_hdr_destroy(hdr);
2912 }
2913
2914 /*
2915 * "Read" the block at the specified DVA (in bp) via the
2916 * cache. If the block is found in the cache, invoke the provided
2917 * callback immediately and return. Note that the `zio' parameter
2918 * in the callback will be NULL in this case, since no IO was
2919 * required. If the block is not in the cache, pass the read request
2920 * on to the spa with a substitute callback function, so that the
2921 * requested block will be added to the cache.
2922 *
2923 * If a read request arrives for a block that has a read in-progress,
2924 * either wait for the in-progress read to complete (and return the
2925 * results); or, if this is a read with a "done" func, add a record
2926 * to the read to invoke the "done" func when the read completes,
2927 * and return; or just return.
2928 *
2929 * arc_read_done() will invoke all the requested "done" functions
2930 * for readers of this block.
2931 */
2932 int
2933 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2934 void *private, int priority, int zio_flags, uint32_t *arc_flags,
2935 const zbookmark_t *zb)
2936 {
2937 arc_buf_hdr_t *hdr;
2938 arc_buf_t *buf = NULL;
2939 kmutex_t *hash_lock;
2940 zio_t *rzio;
2941 uint64_t guid = spa_load_guid(spa);
2942
2943 top:
2944 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2945 &hash_lock);
2946 if (hdr && hdr->b_datacnt > 0) {
2947
2948 *arc_flags |= ARC_CACHED;
2949
2950 if (HDR_IO_IN_PROGRESS(hdr)) {
2951
2952 if (*arc_flags & ARC_WAIT) {
2953 cv_wait(&hdr->b_cv, hash_lock);
2954 mutex_exit(hash_lock);
2955 goto top;
2956 }
2957 ASSERT(*arc_flags & ARC_NOWAIT);
2958
2959 if (done) {
2960 arc_callback_t *acb = NULL;
2961
2962 acb = kmem_zalloc(sizeof (arc_callback_t),
2963 KM_PUSHPAGE);
2964 acb->acb_done = done;
2965 acb->acb_private = private;
2966 if (pio != NULL)
2967 acb->acb_zio_dummy = zio_null(pio,
2968 spa, NULL, NULL, NULL, zio_flags);
2969
2970 ASSERT(acb->acb_done != NULL);
2971 acb->acb_next = hdr->b_acb;
2972 hdr->b_acb = acb;
2973 add_reference(hdr, hash_lock, private);
2974 mutex_exit(hash_lock);
2975 return (0);
2976 }
2977 mutex_exit(hash_lock);
2978 return (0);
2979 }
2980
2981 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2982
2983 if (done) {
2984 add_reference(hdr, hash_lock, private);
2985 /*
2986 * If this block is already in use, create a new
2987 * copy of the data so that we will be guaranteed
2988 * that arc_release() will always succeed.
2989 */
2990 buf = hdr->b_buf;
2991 ASSERT(buf);
2992 ASSERT(buf->b_data);
2993 if (HDR_BUF_AVAILABLE(hdr)) {
2994 ASSERT(buf->b_efunc == NULL);
2995 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2996 } else {
2997 buf = arc_buf_clone(buf);
2998 }
2999
3000 } else if (*arc_flags & ARC_PREFETCH &&
3001 refcount_count(&hdr->b_refcnt) == 0) {
3002 hdr->b_flags |= ARC_PREFETCH;
3003 }
3004 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3005 arc_access(hdr, hash_lock);
3006 if (*arc_flags & ARC_L2CACHE)
3007 hdr->b_flags |= ARC_L2CACHE;
3008 if (*arc_flags & ARC_L2COMPRESS)
3009 hdr->b_flags |= ARC_L2COMPRESS;
3010 mutex_exit(hash_lock);
3011 ARCSTAT_BUMP(arcstat_hits);
3012 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3013 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3014 data, metadata, hits);
3015
3016 if (done)
3017 done(NULL, buf, private);
3018 } else {
3019 uint64_t size = BP_GET_LSIZE(bp);
3020 arc_callback_t *acb;
3021 vdev_t *vd = NULL;
3022 uint64_t addr = -1;
3023 boolean_t devw = B_FALSE;
3024
3025 if (hdr == NULL) {
3026 /* this block is not in the cache */
3027 arc_buf_hdr_t *exists;
3028 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3029 buf = arc_buf_alloc(spa, size, private, type);
3030 hdr = buf->b_hdr;
3031 hdr->b_dva = *BP_IDENTITY(bp);
3032 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3033 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3034 exists = buf_hash_insert(hdr, &hash_lock);
3035 if (exists) {
3036 /* somebody beat us to the hash insert */
3037 mutex_exit(hash_lock);
3038 buf_discard_identity(hdr);
3039 (void) arc_buf_remove_ref(buf, private);
3040 goto top; /* restart the IO request */
3041 }
3042 /* if this is a prefetch, we don't have a reference */
3043 if (*arc_flags & ARC_PREFETCH) {
3044 (void) remove_reference(hdr, hash_lock,
3045 private);
3046 hdr->b_flags |= ARC_PREFETCH;
3047 }
3048 if (*arc_flags & ARC_L2CACHE)
3049 hdr->b_flags |= ARC_L2CACHE;
3050 if (*arc_flags & ARC_L2COMPRESS)
3051 hdr->b_flags |= ARC_L2COMPRESS;
3052 if (BP_GET_LEVEL(bp) > 0)
3053 hdr->b_flags |= ARC_INDIRECT;
3054 } else {
3055 /* this block is in the ghost cache */
3056 ASSERT(GHOST_STATE(hdr->b_state));
3057 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3058 ASSERT0(refcount_count(&hdr->b_refcnt));
3059 ASSERT(hdr->b_buf == NULL);
3060
3061 /* if this is a prefetch, we don't have a reference */
3062 if (*arc_flags & ARC_PREFETCH)
3063 hdr->b_flags |= ARC_PREFETCH;
3064 else
3065 add_reference(hdr, hash_lock, private);
3066 if (*arc_flags & ARC_L2CACHE)
3067 hdr->b_flags |= ARC_L2CACHE;
3068 if (*arc_flags & ARC_L2COMPRESS)
3069 hdr->b_flags |= ARC_L2COMPRESS;
3070 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3071 buf->b_hdr = hdr;
3072 buf->b_data = NULL;
3073 buf->b_efunc = NULL;
3074 buf->b_private = NULL;
3075 buf->b_next = NULL;
3076 hdr->b_buf = buf;
3077 ASSERT(hdr->b_datacnt == 0);
3078 hdr->b_datacnt = 1;
3079 arc_get_data_buf(buf);
3080 arc_access(hdr, hash_lock);
3081 }
3082
3083 ASSERT(!GHOST_STATE(hdr->b_state));
3084
3085 acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
3086 acb->acb_done = done;
3087 acb->acb_private = private;
3088
3089 ASSERT(hdr->b_acb == NULL);
3090 hdr->b_acb = acb;
3091 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3092
3093 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3094 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3095 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3096 addr = hdr->b_l2hdr->b_daddr;
3097 /*
3098 * Lock out device removal.
3099 */
3100 if (vdev_is_dead(vd) ||
3101 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3102 vd = NULL;
3103 }
3104
3105 mutex_exit(hash_lock);
3106
3107 ASSERT3U(hdr->b_size, ==, size);
3108 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3109 uint64_t, size, zbookmark_t *, zb);
3110 ARCSTAT_BUMP(arcstat_misses);
3111 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3112 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3113 data, metadata, misses);
3114
3115 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3116 /*
3117 * Read from the L2ARC if the following are true:
3118 * 1. The L2ARC vdev was previously cached.
3119 * 2. This buffer still has L2ARC metadata.
3120 * 3. This buffer isn't currently writing to the L2ARC.
3121 * 4. The L2ARC entry wasn't evicted, which may
3122 * also have invalidated the vdev.
3123 * 5. This isn't a prefetch with l2arc_noprefetch enabled.
3124 */
3125 if (hdr->b_l2hdr != NULL &&
3126 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3127 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3128 l2arc_read_callback_t *cb;
3129
3130 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3131 ARCSTAT_BUMP(arcstat_l2_hits);
3132
3133 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3134 KM_PUSHPAGE);
3135 cb->l2rcb_buf = buf;
3136 cb->l2rcb_spa = spa;
3137 cb->l2rcb_bp = *bp;
3138 cb->l2rcb_zb = *zb;
3139 cb->l2rcb_flags = zio_flags;
3140 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3141
3142 /*
3143 * l2arc read. The SCL_L2ARC lock will be
3144 * released by l2arc_read_done().
3145 * Issue a null zio if the underlying buffer
3146 * was squashed to zero size by compression.
3147 */
3148 if (hdr->b_l2hdr->b_compress ==
3149 ZIO_COMPRESS_EMPTY) {
3150 rzio = zio_null(pio, spa, vd,
3151 l2arc_read_done, cb,
3152 zio_flags | ZIO_FLAG_DONT_CACHE |
3153 ZIO_FLAG_CANFAIL |
3154 ZIO_FLAG_DONT_PROPAGATE |
3155 ZIO_FLAG_DONT_RETRY);
3156 } else {
3157 rzio = zio_read_phys(pio, vd, addr,
3158 hdr->b_l2hdr->b_asize,
3159 buf->b_data, ZIO_CHECKSUM_OFF,
3160 l2arc_read_done, cb, priority,
3161 zio_flags | ZIO_FLAG_DONT_CACHE |
3162 ZIO_FLAG_CANFAIL |
3163 ZIO_FLAG_DONT_PROPAGATE |
3164 ZIO_FLAG_DONT_RETRY, B_FALSE);
3165 }
3166 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3167 zio_t *, rzio);
3168 ARCSTAT_INCR(arcstat_l2_read_bytes,
3169 hdr->b_l2hdr->b_asize);
3170
3171 if (*arc_flags & ARC_NOWAIT) {
3172 zio_nowait(rzio);
3173 return (0);
3174 }
3175
3176 ASSERT(*arc_flags & ARC_WAIT);
3177 if (zio_wait(rzio) == 0)
3178 return (0);
3179
3180 /* l2arc read error; goto zio_read() */
3181 } else {
3182 DTRACE_PROBE1(l2arc__miss,
3183 arc_buf_hdr_t *, hdr);
3184 ARCSTAT_BUMP(arcstat_l2_misses);
3185 if (HDR_L2_WRITING(hdr))
3186 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3187 spa_config_exit(spa, SCL_L2ARC, vd);
3188 }
3189 } else {
3190 if (vd != NULL)
3191 spa_config_exit(spa, SCL_L2ARC, vd);
3192 if (l2arc_ndev != 0) {
3193 DTRACE_PROBE1(l2arc__miss,
3194 arc_buf_hdr_t *, hdr);
3195 ARCSTAT_BUMP(arcstat_l2_misses);
3196 }
3197 }
3198
3199 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3200 arc_read_done, buf, priority, zio_flags, zb);
3201
3202 if (*arc_flags & ARC_WAIT)
3203 return (zio_wait(rzio));
3204
3205 ASSERT(*arc_flags & ARC_NOWAIT);
3206 zio_nowait(rzio);
3207 }
3208 return (0);
3209 }
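/*
 * A minimal synchronous-read sketch using the interface above (the
 * caller-side variables are illustrative only):
 *
 *	uint32_t aflags = ARC_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err;
 *
 *	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (err == 0 && abuf != NULL) {
 *		... consume BP_GET_LSIZE(bp) bytes at abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 *
 * On error, arc_getbuf_func() has already dropped the reference and
 * left abuf set to NULL.
 */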
3210
3211 arc_prune_t *
3212 arc_add_prune_callback(arc_prune_func_t *func, void *private)
3213 {
3214 arc_prune_t *p;
3215
3216 p = kmem_alloc(sizeof (*p), KM_SLEEP);
3217 p->p_pfunc = func;
3218 p->p_private = private;
3219 list_link_init(&p->p_node);
3220 refcount_create(&p->p_refcnt);
3221
3222 mutex_enter(&arc_prune_mtx);
3223 refcount_add(&p->p_refcnt, &arc_prune_list);
3224 list_insert_head(&arc_prune_list, p);
3225 mutex_exit(&arc_prune_mtx);
3226
3227 return (p);
3228 }
3229
3230 void
3231 arc_remove_prune_callback(arc_prune_t *p)
3232 {
3233 mutex_enter(&arc_prune_mtx);
3234 list_remove(&arc_prune_list, p);
3235 if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
3236 refcount_destroy(&p->p_refcnt);
3237 kmem_free(p, sizeof (*p));
3238 }
3239 mutex_exit(&arc_prune_mtx);
3240 }
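/*
 * A sketch of a (hypothetical) consumer of the prune interface above;
 * the callback runs in the context of the arc_adapt thread, as noted in
 * arc_get_data_buf(), and is asked to drop roughly `nr' bytes worth of
 * pinned metadata references:
 *
 *	static void
 *	my_prune_cb(int64_t nr, void *arg)
 *	{
 *		... release up to nr bytes worth of cached references ...
 *	}
 *
 *	arc_prune_t *ap = arc_add_prune_callback(my_prune_cb, my_state);
 *	...
 *	arc_remove_prune_callback(ap);
 */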
3241
3242 void
3243 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3244 {
3245 ASSERT(buf->b_hdr != NULL);
3246 ASSERT(buf->b_hdr->b_state != arc_anon);
3247 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3248 ASSERT(buf->b_efunc == NULL);
3249 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3250
3251 buf->b_efunc = func;
3252 buf->b_private = private;
3253 }
3254
3255 /*
3256 * Notify the arc that a block was freed, and thus will never be used again.
3257 */
3258 void
3259 arc_freed(spa_t *spa, const blkptr_t *bp)
3260 {
3261 arc_buf_hdr_t *hdr;
3262 kmutex_t *hash_lock;
3263 uint64_t guid = spa_load_guid(spa);
3264
3265 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3266 &hash_lock);
3267 if (hdr == NULL)
3268 return;
3269 if (HDR_BUF_AVAILABLE(hdr)) {
3270 arc_buf_t *buf = hdr->b_buf;
3271 add_reference(hdr, hash_lock, FTAG);
3272 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3273 mutex_exit(hash_lock);
3274
3275 arc_release(buf, FTAG);
3276 (void) arc_buf_remove_ref(buf, FTAG);
3277 } else {
3278 mutex_exit(hash_lock);
3279 }
3280
3281 }
3282
3283 /*
3284 * This is used by the DMU to let the ARC know that a buffer is
3285 * being evicted, so the ARC should clean up. If this arc buf
3286 * is not yet in the evicted state, it will be put there.
3287 */
3288 int
3289 arc_buf_evict(arc_buf_t *buf)
3290 {
3291 arc_buf_hdr_t *hdr;
3292 kmutex_t *hash_lock;
3293 arc_buf_t **bufp;
3294
3295 mutex_enter(&buf->b_evict_lock);
3296 hdr = buf->b_hdr;
3297 if (hdr == NULL) {
3298 /*
3299 * We are in arc_do_user_evicts().
3300 */
3301 ASSERT(buf->b_data == NULL);
3302 mutex_exit(&buf->b_evict_lock);
3303 return (0);
3304 } else if (buf->b_data == NULL) {
3305 arc_buf_t copy = *buf; /* structure assignment */
3306 /*
3307 * We are on the eviction list; process this buffer now
3308 * but let arc_do_user_evicts() do the reaping.
3309 */
3310 buf->b_efunc = NULL;
3311 mutex_exit(&buf->b_evict_lock);
3312 VERIFY(copy.b_efunc(&copy) == 0);
3313 return (1);
3314 }
3315 hash_lock = HDR_LOCK(hdr);
3316 mutex_enter(hash_lock);
3317 hdr = buf->b_hdr;
3318 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3319
3320 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3321 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3322
3323 /*
3324 * Pull this buffer off of the hdr
3325 */
3326 bufp = &hdr->b_buf;
3327 while (*bufp != buf)
3328 bufp = &(*bufp)->b_next;
3329 *bufp = buf->b_next;
3330
3331 ASSERT(buf->b_data != NULL);
3332 arc_buf_destroy(buf, FALSE, FALSE);
3333
3334 if (hdr->b_datacnt == 0) {
3335 arc_state_t *old_state = hdr->b_state;
3336 arc_state_t *evicted_state;
3337
3338 ASSERT(hdr->b_buf == NULL);
3339 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3340
3341 evicted_state =
3342 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3343
3344 mutex_enter(&old_state->arcs_mtx);
3345 mutex_enter(&evicted_state->arcs_mtx);
3346
3347 arc_change_state(evicted_state, hdr, hash_lock);
3348 ASSERT(HDR_IN_HASH_TABLE(hdr));
3349 hdr->b_flags |= ARC_IN_HASH_TABLE;
3350 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3351
3352 mutex_exit(&evicted_state->arcs_mtx);
3353 mutex_exit(&old_state->arcs_mtx);
3354 }
3355 mutex_exit(hash_lock);
3356 mutex_exit(&buf->b_evict_lock);
3357
3358 VERIFY(buf->b_efunc(buf) == 0);
3359 buf->b_efunc = NULL;
3360 buf->b_private = NULL;
3361 buf->b_hdr = NULL;
3362 buf->b_next = NULL;
3363 kmem_cache_free(buf_cache, buf);
3364 return (1);
3365 }
3366
3367 /*
3368 * Release this buffer from the cache. This must be done
3369 * after a read and prior to modifying the buffer contents.
3370 * If the buffer has more than one reference, we must make
3371 * a new hdr for the buffer.
3372 */
3373 void
3374 arc_release(arc_buf_t *buf, void *tag)
3375 {
3376 arc_buf_hdr_t *hdr;
3377 kmutex_t *hash_lock = NULL;
3378 l2arc_buf_hdr_t *l2hdr;
3379 uint64_t buf_size = 0;
3380
3381 /*
3382 * It would be nice to assert that if it's DMU metadata (level >
3383 * 0 || it's the dnode file), then it must be syncing context.
3384 * But we don't know that information at this level.
3385 */
3386
3387 mutex_enter(&buf->b_evict_lock);
3388 hdr = buf->b_hdr;
3389
3390 /* this buffer is not on any list */
3391 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3392
3393 if (hdr->b_state == arc_anon) {
3394 /* this buffer is already released */
3395 ASSERT(buf->b_efunc == NULL);
3396 } else {
3397 hash_lock = HDR_LOCK(hdr);
3398 mutex_enter(hash_lock);
3399 hdr = buf->b_hdr;
3400 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3401 }
3402
3403 l2hdr = hdr->b_l2hdr;
3404 if (l2hdr) {
3405 mutex_enter(&l2arc_buflist_mtx);
3406 hdr->b_l2hdr = NULL;
3407 buf_size = hdr->b_size;
3408 }
3409
3410 /*
3411 * Do we have more than one buf?
3412 */
3413 if (hdr->b_datacnt > 1) {
3414 arc_buf_hdr_t *nhdr;
3415 arc_buf_t **bufp;
3416 uint64_t blksz = hdr->b_size;
3417 uint64_t spa = hdr->b_spa;
3418 arc_buf_contents_t type = hdr->b_type;
3419 uint32_t flags = hdr->b_flags;
3420
3421 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3422 /*
3423 * Pull the data off of this hdr and attach it to
3424 * a new anonymous hdr.
3425 */
3426 (void) remove_reference(hdr, hash_lock, tag);
3427 bufp = &hdr->b_buf;
3428 while (*bufp != buf)
3429 bufp = &(*bufp)->b_next;
3430 *bufp = buf->b_next;
3431 buf->b_next = NULL;
3432
3433 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3434 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3435 if (refcount_is_zero(&hdr->b_refcnt)) {
3436 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3437 ASSERT3U(*size, >=, hdr->b_size);
3438 atomic_add_64(size, -hdr->b_size);
3439 }
3440
3441 /*
3442 * We're releasing a duplicate user data buffer, update
3443 * our statistics accordingly.
3444 */
3445 if (hdr->b_type == ARC_BUFC_DATA) {
3446 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3447 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3448 -hdr->b_size);
3449 }
3450 hdr->b_datacnt -= 1;
3451 arc_cksum_verify(buf);
3452
3453 mutex_exit(hash_lock);
3454
3455 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3456 nhdr->b_size = blksz;
3457 nhdr->b_spa = spa;
3458 nhdr->b_type = type;
3459 nhdr->b_buf = buf;
3460 nhdr->b_state = arc_anon;
3461 nhdr->b_arc_access = 0;
3462 nhdr->b_flags = flags & ARC_L2_WRITING;
3463 nhdr->b_l2hdr = NULL;
3464 nhdr->b_datacnt = 1;
3465 nhdr->b_freeze_cksum = NULL;
3466 (void) refcount_add(&nhdr->b_refcnt, tag);
3467 buf->b_hdr = nhdr;
3468 mutex_exit(&buf->b_evict_lock);
3469 atomic_add_64(&arc_anon->arcs_size, blksz);
3470 } else {
3471 mutex_exit(&buf->b_evict_lock);
3472 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3473 ASSERT(!list_link_active(&hdr->b_arc_node));
3474 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3475 if (hdr->b_state != arc_anon)
3476 arc_change_state(arc_anon, hdr, hash_lock);
3477 hdr->b_arc_access = 0;
3478 if (hash_lock)
3479 mutex_exit(hash_lock);
3480
3481 buf_discard_identity(hdr);
3482 arc_buf_thaw(buf);
3483 }
3484 buf->b_efunc = NULL;
3485 buf->b_private = NULL;
3486
3487 if (l2hdr) {
3488 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3489 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3490 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3491 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
3492 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3493 mutex_exit(&l2arc_buflist_mtx);
3494 }
3495 }
3496
3497 int
3498 arc_released(arc_buf_t *buf)
3499 {
3500 int released;
3501
3502 mutex_enter(&buf->b_evict_lock);
3503 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3504 mutex_exit(&buf->b_evict_lock);
3505 return (released);
3506 }
3507
3508 int
3509 arc_has_callback(arc_buf_t *buf)
3510 {
3511 int callback;
3512
3513 mutex_enter(&buf->b_evict_lock);
3514 callback = (buf->b_efunc != NULL);
3515 mutex_exit(&buf->b_evict_lock);
3516 return (callback);
3517 }
3518
3519 #ifdef ZFS_DEBUG
3520 int
3521 arc_referenced(arc_buf_t *buf)
3522 {
3523 int referenced;
3524
3525 mutex_enter(&buf->b_evict_lock);
3526 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3527 mutex_exit(&buf->b_evict_lock);
3528 return (referenced);
3529 }
3530 #endif
3531
3532 static void
3533 arc_write_ready(zio_t *zio)
3534 {
3535 arc_write_callback_t *callback = zio->io_private;
3536 arc_buf_t *buf = callback->awcb_buf;
3537 arc_buf_hdr_t *hdr = buf->b_hdr;
3538
3539 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3540 callback->awcb_ready(zio, buf, callback->awcb_private);
3541
3542 /*
3543 * If the IO is already in progress, then this is a re-write
3544 * attempt, so we need to thaw and re-compute the cksum.
3545 * It is the responsibility of the callback to handle the
3546 * accounting for any re-write attempt.
3547 */
3548 if (HDR_IO_IN_PROGRESS(hdr)) {
3549 mutex_enter(&hdr->b_freeze_lock);
3550 if (hdr->b_freeze_cksum != NULL) {
3551 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3552 hdr->b_freeze_cksum = NULL;
3553 }
3554 mutex_exit(&hdr->b_freeze_lock);
3555 }
3556 arc_cksum_compute(buf, B_FALSE);
3557 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3558 }
3559
3560 static void
3561 arc_write_done(zio_t *zio)
3562 {
3563 arc_write_callback_t *callback = zio->io_private;
3564 arc_buf_t *buf = callback->awcb_buf;
3565 arc_buf_hdr_t *hdr = buf->b_hdr;
3566
3567 ASSERT(hdr->b_acb == NULL);
3568
3569 if (zio->io_error == 0) {
3570 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3571 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3572 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3573 } else {
3574 ASSERT(BUF_EMPTY(hdr));
3575 }
3576
3577 /*
3578 * If the block to be written was all-zero, we may have
3579 * compressed it away. In this case no write was performed
3580 * so there will be no dva/birth/checksum. The buffer must
3581 * therefore remain anonymous (and uncached).
3582 */
3583 if (!BUF_EMPTY(hdr)) {
3584 arc_buf_hdr_t *exists;
3585 kmutex_t *hash_lock;
3586
3587 ASSERT(zio->io_error == 0);
3588
3589 arc_cksum_verify(buf);
3590
3591 exists = buf_hash_insert(hdr, &hash_lock);
3592 if (exists) {
3593 /*
3594 * This can only happen if we overwrite for
3595 * sync-to-convergence, because we remove
3596 * buffers from the hash table when we arc_free().
3597 */
3598 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3599 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3600 panic("bad overwrite, hdr=%p exists=%p",
3601 (void *)hdr, (void *)exists);
3602 ASSERT(refcount_is_zero(&exists->b_refcnt));
3603 arc_change_state(arc_anon, exists, hash_lock);
3604 mutex_exit(hash_lock);
3605 arc_hdr_destroy(exists);
3606 exists = buf_hash_insert(hdr, &hash_lock);
3607 ASSERT3P(exists, ==, NULL);
3608 } else {
3609 /* Dedup */
3610 ASSERT(hdr->b_datacnt == 1);
3611 ASSERT(hdr->b_state == arc_anon);
3612 ASSERT(BP_GET_DEDUP(zio->io_bp));
3613 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3614 }
3615 }
3616 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3617 /* if it's not anon, we are doing a scrub */
3618 if (!exists && hdr->b_state == arc_anon)
3619 arc_access(hdr, hash_lock);
3620 mutex_exit(hash_lock);
3621 } else {
3622 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3623 }
3624
3625 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3626 callback->awcb_done(zio, buf, callback->awcb_private);
3627
3628 kmem_free(callback, sizeof (arc_write_callback_t));
3629 }
3630
3631 zio_t *
3632 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3633 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3634 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3635 void *private, int priority, int zio_flags, const zbookmark_t *zb)
3636 {
3637 arc_buf_hdr_t *hdr = buf->b_hdr;
3638 arc_write_callback_t *callback;
3639 zio_t *zio;
3640
3641 ASSERT(ready != NULL);
3642 ASSERT(done != NULL);
3643 ASSERT(!HDR_IO_ERROR(hdr));
3644 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3645 ASSERT(hdr->b_acb == NULL);
3646 if (l2arc)
3647 hdr->b_flags |= ARC_L2CACHE;
3648 if (l2arc_compress)
3649 hdr->b_flags |= ARC_L2COMPRESS;
3650 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
3651 callback->awcb_ready = ready;
3652 callback->awcb_done = done;
3653 callback->awcb_private = private;
3654 callback->awcb_buf = buf;
3655
3656 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3657 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3658
3659 return (zio);
3660 }
3661
3662 static int
3663 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3664 {
3665 #ifdef _KERNEL
3666 uint64_t available_memory;
3667
3668 if (zfs_arc_memory_throttle_disable)
3669 return (0);
3670
3671 /* Easily reclaimable memory (free + inactive + arc-evictable) */
3672 available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
3673
3674 if (available_memory <= zfs_write_limit_max) {
3675 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3676 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
3677 return (EAGAIN);
3678 }
3679
3680 if (inflight_data > available_memory / 4) {
3681 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3682 DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
3683 return (ERESTART);
3684 }
3685 #endif
3686 return (0);
3687 }
3688
3689 void
3690 arc_tempreserve_clear(uint64_t reserve)
3691 {
3692 atomic_add_64(&arc_tempreserve, -reserve);
3693 ASSERT((int64_t)arc_tempreserve >= 0);
3694 }
3695
3696 int
3697 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3698 {
3699 int error;
3700 uint64_t anon_size;
3701
3702 #ifdef ZFS_DEBUG
3703 /*
3704 * Once in a while, fail for no reason. Everything should cope.
3705 */
3706 if (spa_get_random(10000) == 0) {
3707 dprintf("forcing random failure\n");
3708 return (ERESTART);
3709 }
3710 #endif
3711 if (reserve > arc_c/4 && !arc_no_grow)
3712 arc_c = MIN(arc_c_max, reserve * 4);
3713 if (reserve > arc_c) {
3714 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
3715 return (ENOMEM);
3716 }
3717
3718 /*
3719 * Don't count loaned bufs as in flight dirty data to prevent long
3720 * network delays from blocking transactions that are ready to be
3721 * assigned to a txg.
3722 */
3723 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3724
3725 /*
3726 * Writes will almost always require additional memory allocations
3727 * in order to compress/encrypt/etc. the data. We therefore need to
3728 * make sure that there is sufficient available memory for this.
3729 */
3730 if ((error = arc_memory_throttle(reserve, anon_size, txg)))
3731 return (error);
3732
3733 /*
3734 * Throttle writes when the amount of dirty data in the cache
3735 * gets too large. We try to keep the cache less than half full
3736 * of dirty blocks so that our sync times don't grow too large.
3737 * Note: if two requests come in concurrently, we might let them
3738 * both succeed, when one of them should fail. Not a huge deal.
3739 */
3740
3741 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3742 anon_size > arc_c / 4) {
3743 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3744 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3745 arc_tempreserve>>10,
3746 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3747 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3748 reserve>>10, arc_c>>10);
3749 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
3750 return (ERESTART);
3751 }
3752 atomic_add_64(&arc_tempreserve, reserve);
3753 return (0);
3754 }
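
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the dirty-data throttle decision made in arc_tempreserve_space() above,
 * restated as a standalone predicate.  The function name and its caller
 * are hypothetical; only the arithmetic mirrors the check performed on
 * reserve, arc_tempreserve, anon_size and arc_c.
 */
static int
example_would_throttle_write(uint64_t reserve, uint64_t tempreserve,
    uint64_t anon_size, uint64_t cache_target)
{
	/*
	 * Throttle when the requested plus in-flight dirty data would
	 * exceed half of the cache target *and* anonymous (not yet
	 * synced) data alone exceeds a quarter of it.
	 */
	return (reserve + tempreserve + anon_size > cache_target / 2 &&
	    anon_size > cache_target / 4);
}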
3755
3756 static void
3757 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
3758 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
3759 {
3760 size->value.ui64 = state->arcs_size;
3761 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
3762 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
3763 }
3764
3765 static int
3766 arc_kstat_update(kstat_t *ksp, int rw)
3767 {
3768 arc_stats_t *as = ksp->ks_data;
3769
3770 if (rw == KSTAT_WRITE) {
3771 return (EACCES);
3772 } else {
3773 arc_kstat_update_state(arc_anon,
3774 &as->arcstat_anon_size,
3775 &as->arcstat_anon_evict_data,
3776 &as->arcstat_anon_evict_metadata);
3777 arc_kstat_update_state(arc_mru,
3778 &as->arcstat_mru_size,
3779 &as->arcstat_mru_evict_data,
3780 &as->arcstat_mru_evict_metadata);
3781 arc_kstat_update_state(arc_mru_ghost,
3782 &as->arcstat_mru_ghost_size,
3783 &as->arcstat_mru_ghost_evict_data,
3784 &as->arcstat_mru_ghost_evict_metadata);
3785 arc_kstat_update_state(arc_mfu,
3786 &as->arcstat_mfu_size,
3787 &as->arcstat_mfu_evict_data,
3788 &as->arcstat_mfu_evict_metadata);
3789 arc_kstat_update_state(arc_mfu_ghost,
3790 &as->arcstat_mfu_ghost_size,
3791 &as->arcstat_mfu_ghost_evict_data,
3792 &as->arcstat_mfu_ghost_evict_metadata);
3793 }
3794
3795 return (0);
3796 }
3797
3798 void
3799 arc_init(void)
3800 {
3801 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3802 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3803
3804 /* Convert seconds to clock ticks */
3805 zfs_arc_min_prefetch_lifespan = 1 * hz;
3806
3807 /* Start out with 1/8 of all memory */
3808 arc_c = physmem * PAGESIZE / 8;
3809
3810 #ifdef _KERNEL
3811 /*
3812 * On architectures where the physical memory can be larger
3813 * than the addressable space (Intel in 32-bit mode), we may
3814 * need to limit the cache to 1/8 of VM size.
3815 */
3816 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3817 /*
3818 * Register a shrinker to support synchronous (direct) memory
3819 * reclaim from the arc. This is done to prevent kswapd from
3820 * swapping out pages when it is preferable to shrink the arc.
3821 */
3822 spl_register_shrinker(&arc_shrinker);
3823 #endif
3824
3825 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3826 arc_c_min = MAX(arc_c / 4, 64<<20);
3827 /* set max to 1/2 of all memory */
3828 arc_c_max = MAX(arc_c * 4, arc_c_max);
3829
3830 /*
3831 * Allow the tunables to override our calculations if they are
3832 * reasonable (i.e. over 64MB)
3833 */
3834 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3835 arc_c_max = zfs_arc_max;
3836 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3837 arc_c_min = zfs_arc_min;
3838
3839 arc_c = arc_c_max;
3840 arc_p = (arc_c >> 1);
3841
3842 /* limit meta-data to 1/4 of the arc capacity */
3843 arc_meta_limit = arc_c_max / 4;
3844 arc_meta_max = 0;
3845
3846 /* Allow the tunable to override if it is reasonable */
3847 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3848 arc_meta_limit = zfs_arc_meta_limit;
3849
3850 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3851 arc_c_min = arc_meta_limit / 2;
3852
3853 /* if kmem_flags are set, let's try to use less memory */
3854 if (kmem_debugging())
3855 arc_c = arc_c / 2;
3856 if (arc_c < arc_c_min)
3857 arc_c = arc_c_min;
3858
3859 arc_anon = &ARC_anon;
3860 arc_mru = &ARC_mru;
3861 arc_mru_ghost = &ARC_mru_ghost;
3862 arc_mfu = &ARC_mfu;
3863 arc_mfu_ghost = &ARC_mfu_ghost;
3864 arc_l2c_only = &ARC_l2c_only;
3865 arc_size = 0;
3866
3867 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3868 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3869 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3870 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3871 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3872 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3873
3874 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3875 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3876 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3877 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3878 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3879 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3880 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3881 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3882 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3883 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3884 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3885 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3886 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3887 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3888 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3889 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3890 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3891 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3892 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3893 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3894
3895 buf_init();
3896
3897 arc_thread_exit = 0;
3898 list_create(&arc_prune_list, sizeof (arc_prune_t),
3899 offsetof(arc_prune_t, p_node));
3900 arc_eviction_list = NULL;
3901 mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
3902 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3903 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3904
3905 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3906 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3907
3908 if (arc_ksp != NULL) {
3909 arc_ksp->ks_data = &arc_stats;
3910 arc_ksp->ks_update = arc_kstat_update;
3911 kstat_install(arc_ksp);
3912 }
3913
3914 (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
3915 TS_RUN, minclsyspri);
3916
3917 arc_dead = FALSE;
3918 arc_warm = B_FALSE;
3919
3920 if (zfs_write_limit_max == 0)
3921 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3922 else
3923 zfs_write_limit_shift = 0;
3924 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3925 }
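
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the default ARC sizing arithmetic from arc_init() applied to a
 * hypothetical machine, ignoring the 32-bit VM clamp, module tunables and
 * kmem-debugging adjustments.  With 8 GiB of physical memory this yields
 * arc_c = 1 GiB, arc_c_min = 256 MiB and arc_c_max = 4 GiB.
 */
static void
example_default_arc_sizing(uint64_t physmem_bytes, uint64_t *c,
    uint64_t *c_min, uint64_t *c_max)
{
	*c = physmem_bytes / 8;		/* start at 1/8 of memory */
	*c_min = *c / 4;		/* 1/32 of memory ... */
	if (*c_min < (64ULL << 20))
		*c_min = 64ULL << 20;	/* ... or 64 MiB, whichever is more */
	*c_max = *c * 4;		/* 1/2 of memory */
}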
3926
3927 void
3928 arc_fini(void)
3929 {
3930 arc_prune_t *p;
3931
3932 mutex_enter(&arc_reclaim_thr_lock);
3933 #ifdef _KERNEL
3934 spl_unregister_shrinker(&arc_shrinker);
3935 #endif /* _KERNEL */
3936
3937 arc_thread_exit = 1;
3938 while (arc_thread_exit != 0)
3939 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3940 mutex_exit(&arc_reclaim_thr_lock);
3941
3942 arc_flush(NULL);
3943
3944 arc_dead = TRUE;
3945
3946 if (arc_ksp != NULL) {
3947 kstat_delete(arc_ksp);
3948 arc_ksp = NULL;
3949 }
3950
3951 mutex_enter(&arc_prune_mtx);
3952 while ((p = list_head(&arc_prune_list)) != NULL) {
3953 list_remove(&arc_prune_list, p);
3954 refcount_remove(&p->p_refcnt, &arc_prune_list);
3955 refcount_destroy(&p->p_refcnt);
3956 kmem_free(p, sizeof (*p));
3957 }
3958 mutex_exit(&arc_prune_mtx);
3959
3960 list_destroy(&arc_prune_list);
3961 mutex_destroy(&arc_prune_mtx);
3962 mutex_destroy(&arc_eviction_mtx);
3963 mutex_destroy(&arc_reclaim_thr_lock);
3964 cv_destroy(&arc_reclaim_thr_cv);
3965
3966 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3967 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3968 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3969 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3970 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3971 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3972 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3973 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3974
3975 mutex_destroy(&arc_anon->arcs_mtx);
3976 mutex_destroy(&arc_mru->arcs_mtx);
3977 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3978 mutex_destroy(&arc_mfu->arcs_mtx);
3979 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3980 mutex_destroy(&arc_l2c_only->arcs_mtx);
3981
3982 mutex_destroy(&zfs_write_limit_lock);
3983
3984 buf_fini();
3985
3986 ASSERT(arc_loaned_bytes == 0);
3987 }
3988
3989 /*
3990 * Level 2 ARC
3991 *
3992 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3993 * It uses dedicated storage devices to hold cached data, which are populated
3994 * using large infrequent writes. The main role of this cache is to boost
3995 * the performance of random read workloads. The intended L2ARC devices
3996 * include short-stroked disks, solid state disks, and other media with
3997 * substantially faster read latency than disk.
3998 *
3999 * +-----------------------+
4000 * | ARC |
4001 * +-----------------------+
4002 * | ^ ^
4003 * | | |
4004 * l2arc_feed_thread() arc_read()
4005 * | | |
4006 * | l2arc read |
4007 * V | |
4008 * +---------------+ |
4009 * | L2ARC | |
4010 * +---------------+ |
4011 * | ^ |
4012 * l2arc_write() | |
4013 * | | |
4014 * V | |
4015 * +-------+ +-------+
4016 * | vdev | | vdev |
4017 * | cache | | cache |
4018 * +-------+ +-------+
4019 * +=========+ .-----.
4020 * : L2ARC : |-_____-|
4021 * : devices : | Disks |
4022 * +=========+ `-_____-'
4023 *
4024 * Read requests are satisfied from the following sources, in order:
4025 *
4026 * 1) ARC
4027 * 2) vdev cache of L2ARC devices
4028 * 3) L2ARC devices
4029 * 4) vdev cache of disks
4030 * 5) disks
4031 *
4032 * Some L2ARC device types exhibit extremely slow write performance.
4033 * To accommodate this, there are some significant differences between
4034 * the L2ARC and a traditional cache design:
4035 *
4036 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
4037 * the ARC behave as usual, freeing buffers and placing headers on ghost
4038 * lists. The ARC does not send buffers to the L2ARC during eviction as
4039 * this would add inflated write latencies for all ARC memory pressure.
4040 *
4041 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4042 * It does this by periodically scanning buffers from the eviction-end of
4043 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4044 * not already there. It scans until a headroom of buffers is satisfied,
4045 * which itself is a buffer for ARC eviction. If a compressible buffer is
4046 * found during scanning and selected for writing to an L2ARC device, we
4047 * temporarily boost scanning headroom during the next scan cycle to make
4048 * sure we adapt to compression effects (which might significantly reduce
4049 * the data volume we write to L2ARC). The thread that does this is
4050 * l2arc_feed_thread(), illustrated below; example sizes are included to
4051 * provide a better sense of ratio than this diagram:
4052 *
4053 * head --> tail
4054 * +---------------------+----------+
4055 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
4056 * +---------------------+----------+ | o L2ARC eligible
4057 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
4058 * +---------------------+----------+ |
4059 * 15.9 Gbytes ^ 32 Mbytes |
4060 * headroom |
4061 * l2arc_feed_thread()
4062 * |
4063 * l2arc write hand <--[oooo]--'
4064 * | 8 Mbyte
4065 * | write max
4066 * V
4067 * +==============================+
4068 * L2ARC dev |####|#|###|###| |####| ... |
4069 * +==============================+
4070 * 32 Gbytes
4071 *
4072 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4073 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4074 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
4075 * safe to say that this is an uncommon case, since buffers at the end of
4076 * the ARC lists have moved there due to inactivity.
4077 *
4078 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4079 * then the L2ARC simply misses copying some buffers. This serves as a
4080 * pressure valve to prevent heavy read workloads from both stalling the ARC
4081 * with waits and clogging the L2ARC with writes. This also helps prevent
4082 * the potential for the L2ARC to churn if it attempts to cache content too
4083 * quickly, such as during backups of the entire pool.
4084 *
4085 * 5. After system boot and before the ARC has filled main memory, there are
4086 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4087 * lists can remain mostly static. Instead of searching from the tail of these
4088 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4089 * for eligible buffers, greatly increasing its chance of finding them.
4090 *
4091 * The L2ARC device write speed is also boosted during this time so that
4092 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
4093 * there are no L2ARC reads, and no fear of degrading read performance
4094 * through increased writes.
4095 *
4096 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4097 * the vdev queue can aggregate them into larger and fewer writes. Each
4098 * device is written to in a rotor fashion, sweeping writes through
4099 * available space then repeating.
4100 *
4101 * 7. The L2ARC does not store dirty content. It never needs to flush
4102 * write buffers back to disk based storage.
4103 *
4104 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4105 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4106 *
4107 * The performance of the L2ARC can be tweaked by a number of tunables, which
4108 * may be necessary for different workloads:
4109 *
4110 * l2arc_write_max max write bytes per interval
4111 * l2arc_write_boost extra write bytes during device warmup
4112 * l2arc_noprefetch skip caching prefetched buffers
4113 * l2arc_nocompress skip compressing buffers
4114 * l2arc_headroom number of max device writes to precache
4115 * l2arc_headroom_boost when we find compressed buffers during ARC
4116 * scanning, we multiply headroom by this
4117 * percentage factor for the next scan cycle,
4118 * since more compressed buffers are likely to
4119 * be present
4120 * l2arc_feed_secs seconds between L2ARC writing
4121 *
4122 * Tunables may be removed or added as future performance improvements are
4123 * integrated, and also may become zpool properties.
4124 *
4125 * There are three key functions that control how the L2ARC warms up:
4126 *
4127 * l2arc_write_eligible() check if a buffer is eligible to cache
4128 * l2arc_write_size() calculate how much to write
4129 * l2arc_write_interval() calculate sleep delay between writes
4130 *
4131 * These three functions determine what to write, how much, and how quickly
4132 * to send writes.
4133 */
4134
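/*
 * Illustrative sketch (editor addition, not part of the original source):
 * how the scan headroom described above is derived.  l2arc_write_buffers()
 * computes headroom = target_sz * l2arc_headroom and, when compressible
 * buffers were seen on the previous cycle, scales it by
 * l2arc_headroom_boost percent.  For example, an 8 MiB write target with
 * a headroom factor of 2 and a boost of 200 percent gives 32 MiB of
 * headroom for the next scan.
 */
static uint64_t
example_scan_headroom(uint64_t target_sz, uint64_t headroom_factor,
    uint64_t boost_pct, boolean_t boosted)
{
	uint64_t headroom = target_sz * headroom_factor;

	if (boosted)
		headroom = (headroom * boost_pct) / 100;
	return (headroom);
}
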
4135 static boolean_t
4136 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4137 {
4138 /*
4139 * A buffer is *not* eligible for the L2ARC if it:
4140 * 1. belongs to a different spa.
4141 * 2. is already cached on the L2ARC.
4142 * 3. has an I/O in progress (it may be an incomplete read).
4143 * 4. is flagged not eligible (zfs property).
4144 */
4145 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4146 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4147 return (B_FALSE);
4148
4149 return (B_TRUE);
4150 }
4151
4152 static uint64_t
4153 l2arc_write_size(void)
4154 {
4155 uint64_t size;
4156
4157 /*
4158 * Make sure our globals have meaningful values in case the user
4159 * altered them.
4160 */
4161 size = l2arc_write_max;
4162 if (size == 0) {
4163 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4164 "be greater than zero, resetting it to the default (%d)",
4165 L2ARC_WRITE_SIZE);
4166 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4167 }
4168
4169 if (arc_warm == B_FALSE)
4170 size += l2arc_write_boost;
4171
4172 return (size);
4173
4174 }
4175
4176 static clock_t
4177 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4178 {
4179 clock_t interval, next, now;
4180
4181 /*
4182 * If the ARC lists are busy, increase our write rate; if the
4183 * lists are stale, idle back. This is achieved by checking
4184 * how much we previously wrote - if it was more than half of
4185 * what we wanted, schedule the next write much sooner.
4186 */
4187 if (l2arc_feed_again && wrote > (wanted / 2))
4188 interval = (hz * l2arc_feed_min_ms) / 1000;
4189 else
4190 interval = hz * l2arc_feed_secs;
4191
4192 now = ddi_get_lbolt();
4193 next = MAX(now, MIN(now + interval, began + interval));
4194
4195 return (next);
4196 }
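
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the wake-up computed above, with hz assumed to be 1000 ticks/second.
 * If a feed began at tick 10000 and finished at tick 10400, and more than
 * half of the wanted bytes were written with l2arc_feed_again set, then
 * interval = 200 ticks and next = MAX(10400, MIN(10600, 10200)) = 10400,
 * i.e. the thread feeds again immediately because the fast interval
 * already elapsed while writing.
 */
static clock_t
example_next_feed_tick(clock_t began, clock_t now, clock_t interval)
{
	clock_t next = began + interval;	/* measured from the write start */

	if (next > now + interval)
		next = now + interval;		/* at most one interval from now */
	if (next < now)
		next = now;			/* never schedule in the past */
	return (next);
}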
4197
4198 static void
4199 l2arc_hdr_stat_add(void)
4200 {
4201 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
4202 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4203 }
4204
4205 static void
4206 l2arc_hdr_stat_remove(void)
4207 {
4208 ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
4209 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4210 }
4211
4212 /*
4213 * Cycle through L2ARC devices. This is how L2ARC load balances.
4214 * If a device is returned, this also returns holding the spa config lock.
4215 */
4216 static l2arc_dev_t *
4217 l2arc_dev_get_next(void)
4218 {
4219 l2arc_dev_t *first, *next = NULL;
4220
4221 /*
4222 * Lock out the removal of spas (spa_namespace_lock), then removal
4223 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4224 * both locks will be dropped and a spa config lock held instead.
4225 */
4226 mutex_enter(&spa_namespace_lock);
4227 mutex_enter(&l2arc_dev_mtx);
4228
4229 /* if there are no vdevs, there is nothing to do */
4230 if (l2arc_ndev == 0)
4231 goto out;
4232
4233 first = NULL;
4234 next = l2arc_dev_last;
4235 do {
4236 /* loop around the list looking for a non-faulted vdev */
4237 if (next == NULL) {
4238 next = list_head(l2arc_dev_list);
4239 } else {
4240 next = list_next(l2arc_dev_list, next);
4241 if (next == NULL)
4242 next = list_head(l2arc_dev_list);
4243 }
4244
4245 /* if we have come back to the start, bail out */
4246 if (first == NULL)
4247 first = next;
4248 else if (next == first)
4249 break;
4250
4251 } while (vdev_is_dead(next->l2ad_vdev));
4252
4253 /* if we were unable to find any usable vdevs, return NULL */
4254 if (vdev_is_dead(next->l2ad_vdev))
4255 next = NULL;
4256
4257 l2arc_dev_last = next;
4258
4259 out:
4260 mutex_exit(&l2arc_dev_mtx);
4261
4262 /*
4263 * Grab the config lock to prevent the 'next' device from being
4264 * removed while we are writing to it.
4265 */
4266 if (next != NULL)
4267 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4268 mutex_exit(&spa_namespace_lock);
4269
4270 return (next);
4271 }
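
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the device rotor above expressed over a plain array instead of the
 * kernel list_t, to make the round-robin-with-skip behaviour explicit.
 * 'faulted' and the index-based layout are hypothetical stand-ins for
 * vdev_is_dead() and l2arc_dev_list.
 */
static int
example_next_device(const int *faulted, int ndev, int last)
{
	int i, idx;

	if (ndev == 0)
		return (-1);			/* no cache devices at all */

	for (i = 1; i <= ndev; i++) {
		idx = (last + i) % ndev;	/* continue after the last device used */
		if (!faulted[idx])
			return (idx);		/* first healthy device wins */
	}
	return (-1);				/* every device is faulted */
}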
4272
4273 /*
4274 * Free buffers that were tagged for destruction.
4275 */
4276 static void
4277 l2arc_do_free_on_write(void)
4278 {
4279 list_t *buflist;
4280 l2arc_data_free_t *df, *df_prev;
4281
4282 mutex_enter(&l2arc_free_on_write_mtx);
4283 buflist = l2arc_free_on_write;
4284
4285 for (df = list_tail(buflist); df; df = df_prev) {
4286 df_prev = list_prev(buflist, df);
4287 ASSERT(df->l2df_data != NULL);
4288 ASSERT(df->l2df_func != NULL);
4289 df->l2df_func(df->l2df_data, df->l2df_size);
4290 list_remove(buflist, df);
4291 kmem_free(df, sizeof (l2arc_data_free_t));
4292 }
4293
4294 mutex_exit(&l2arc_free_on_write_mtx);
4295 }
4296
4297 /*
4298 * A write to a cache device has completed. Update all headers to allow
4299 * reads from these buffers to begin.
4300 */
4301 static void
4302 l2arc_write_done(zio_t *zio)
4303 {
4304 l2arc_write_callback_t *cb;
4305 l2arc_dev_t *dev;
4306 list_t *buflist;
4307 arc_buf_hdr_t *head, *ab, *ab_prev;
4308 l2arc_buf_hdr_t *abl2;
4309 kmutex_t *hash_lock;
4310
4311 cb = zio->io_private;
4312 ASSERT(cb != NULL);
4313 dev = cb->l2wcb_dev;
4314 ASSERT(dev != NULL);
4315 head = cb->l2wcb_head;
4316 ASSERT(head != NULL);
4317 buflist = dev->l2ad_buflist;
4318 ASSERT(buflist != NULL);
4319 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4320 l2arc_write_callback_t *, cb);
4321
4322 if (zio->io_error != 0)
4323 ARCSTAT_BUMP(arcstat_l2_writes_error);
4324
4325 mutex_enter(&l2arc_buflist_mtx);
4326
4327 /*
4328 * All writes completed, or an error was hit.
4329 */
4330 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4331 ab_prev = list_prev(buflist, ab);
4332
4333 hash_lock = HDR_LOCK(ab);
4334 if (!mutex_tryenter(hash_lock)) {
4335 /*
4336 * This buffer misses out. It may be in a stage
4337 * of eviction. Its ARC_L2_WRITING flag will be
4338 * left set, denying reads to this buffer.
4339 */
4340 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4341 continue;
4342 }
4343
4344 abl2 = ab->b_l2hdr;
4345
4346 /*
4347 * Release the temporary compressed buffer as soon as possible.
4348 */
4349 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4350 l2arc_release_cdata_buf(ab);
4351
4352 if (zio->io_error != 0) {
4353 /*
4354 * Error - drop L2ARC entry.
4355 */
4356 list_remove(buflist, ab);
4357 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4358 ab->b_l2hdr = NULL;
4359 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4360 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4361 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4362 }
4363
4364 /*
4365 * Allow ARC to begin reads to this L2ARC entry.
4366 */
4367 ab->b_flags &= ~ARC_L2_WRITING;
4368
4369 mutex_exit(hash_lock);
4370 }
4371
4372 atomic_inc_64(&l2arc_writes_done);
4373 list_remove(buflist, head);
4374 kmem_cache_free(hdr_cache, head);
4375 mutex_exit(&l2arc_buflist_mtx);
4376
4377 l2arc_do_free_on_write();
4378
4379 kmem_free(cb, sizeof (l2arc_write_callback_t));
4380 }
4381
4382 /*
4383 * A read to a cache device completed. Validate buffer contents before
4384 * handing over to the regular ARC routines.
4385 */
4386 static void
4387 l2arc_read_done(zio_t *zio)
4388 {
4389 l2arc_read_callback_t *cb;
4390 arc_buf_hdr_t *hdr;
4391 arc_buf_t *buf;
4392 kmutex_t *hash_lock;
4393 int equal;
4394
4395 ASSERT(zio->io_vd != NULL);
4396 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4397
4398 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4399
4400 cb = zio->io_private;
4401 ASSERT(cb != NULL);
4402 buf = cb->l2rcb_buf;
4403 ASSERT(buf != NULL);
4404
4405 hash_lock = HDR_LOCK(buf->b_hdr);
4406 mutex_enter(hash_lock);
4407 hdr = buf->b_hdr;
4408 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4409
4410 /*
4411 * If the buffer was compressed, decompress it first.
4412 */
4413 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4414 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4415 ASSERT(zio->io_data != NULL);
4416
4417 /*
4418 * Check this survived the L2ARC journey.
4419 */
4420 equal = arc_cksum_equal(buf);
4421 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4422 mutex_exit(hash_lock);
4423 zio->io_private = buf;
4424 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4425 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4426 arc_read_done(zio);
4427 } else {
4428 mutex_exit(hash_lock);
4429 /*
4430 * Buffer didn't survive caching. Increment stats and
4431 * reissue to the original storage device.
4432 */
4433 if (zio->io_error != 0) {
4434 ARCSTAT_BUMP(arcstat_l2_io_error);
4435 } else {
4436 zio->io_error = EIO;
4437 }
4438 if (!equal)
4439 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4440
4441 /*
4442 * If there's no waiter, issue an async i/o to the primary
4443 * storage now. If there *is* a waiter, the caller must
4444 * issue the i/o in a context where it's OK to block.
4445 */
4446 if (zio->io_waiter == NULL) {
4447 zio_t *pio = zio_unique_parent(zio);
4448
4449 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4450
4451 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4452 buf->b_data, zio->io_size, arc_read_done, buf,
4453 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4454 }
4455 }
4456
4457 kmem_free(cb, sizeof (l2arc_read_callback_t));
4458 }
4459
4460 /*
4461 * This is the list priority from which the L2ARC will search for pages to
4462 * cache. This is used within loops (0..3) to cycle through lists in the
4463 * desired order. This order can have a significant effect on cache
4464 * performance.
4465 *
4466 * Currently the metadata lists are hit first, MFU then MRU, followed by
4467 * the data lists. This function returns a locked list, and also returns
4468 * the lock pointer.
4469 */
4470 static list_t *
4471 l2arc_list_locked(int list_num, kmutex_t **lock)
4472 {
4473 list_t *list = NULL;
4474
4475 ASSERT(list_num >= 0 && list_num <= 3);
4476
4477 switch (list_num) {
4478 case 0:
4479 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4480 *lock = &arc_mfu->arcs_mtx;
4481 break;
4482 case 1:
4483 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4484 *lock = &arc_mru->arcs_mtx;
4485 break;
4486 case 2:
4487 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4488 *lock = &arc_mfu->arcs_mtx;
4489 break;
4490 case 3:
4491 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4492 *lock = &arc_mru->arcs_mtx;
4493 break;
4494 }
4495
4496 ASSERT(!(MUTEX_HELD(*lock)));
4497 mutex_enter(*lock);
4498 return (list);
4499 }
4500
4501 /*
4502 * Evict buffers from the device write hand to the distance specified in
4503 * bytes. This distance may span populated buffers, or it may span nothing.
4504 * This is clearing a region on the L2ARC device ready for writing.
4505 * If the 'all' boolean is set, every buffer is evicted.
4506 */
4507 static void
4508 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4509 {
4510 list_t *buflist;
4511 l2arc_buf_hdr_t *abl2;
4512 arc_buf_hdr_t *ab, *ab_prev;
4513 kmutex_t *hash_lock;
4514 uint64_t taddr;
4515
4516 buflist = dev->l2ad_buflist;
4517
4518 if (buflist == NULL)
4519 return;
4520
4521 if (!all && dev->l2ad_first) {
4522 /*
4523 * This is the first sweep through the device. There is
4524 * nothing to evict.
4525 */
4526 return;
4527 }
4528
4529 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4530 /*
4531 * When nearing the end of the device, evict to the end
4532 * before the device write hand jumps to the start.
4533 */
4534 taddr = dev->l2ad_end;
4535 } else {
4536 taddr = dev->l2ad_hand + distance;
4537 }
4538 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4539 uint64_t, taddr, boolean_t, all);
4540
4541 top:
4542 mutex_enter(&l2arc_buflist_mtx);
4543 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4544 ab_prev = list_prev(buflist, ab);
4545
4546 hash_lock = HDR_LOCK(ab);
4547 if (!mutex_tryenter(hash_lock)) {
4548 /*
4549 * Missed the hash lock. Retry.
4550 */
4551 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4552 mutex_exit(&l2arc_buflist_mtx);
4553 mutex_enter(hash_lock);
4554 mutex_exit(hash_lock);
4555 goto top;
4556 }
4557
4558 if (HDR_L2_WRITE_HEAD(ab)) {
4559 /*
4560 * We hit a write head node. Leave it for
4561 * l2arc_write_done().
4562 */
4563 list_remove(buflist, ab);
4564 mutex_exit(hash_lock);
4565 continue;
4566 }
4567
4568 if (!all && ab->b_l2hdr != NULL &&
4569 (ab->b_l2hdr->b_daddr > taddr ||
4570 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4571 /*
4572 * We've evicted to the target address,
4573 * or the end of the device.
4574 */
4575 mutex_exit(hash_lock);
4576 break;
4577 }
4578
4579 if (HDR_FREE_IN_PROGRESS(ab)) {
4580 /*
4581 * Already on the path to destruction.
4582 */
4583 mutex_exit(hash_lock);
4584 continue;
4585 }
4586
4587 if (ab->b_state == arc_l2c_only) {
4588 ASSERT(!HDR_L2_READING(ab));
4589 /*
4590 * This doesn't exist in the ARC. Destroy.
4591 * arc_hdr_destroy() will call list_remove()
4592 * and decrement arcstat_l2_size.
4593 */
4594 arc_change_state(arc_anon, ab, hash_lock);
4595 arc_hdr_destroy(ab);
4596 } else {
4597 /*
4598 * Invalidate issued or about to be issued
4599 * reads, since we may be about to write
4600 * over this location.
4601 */
4602 if (HDR_L2_READING(ab)) {
4603 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4604 ab->b_flags |= ARC_L2_EVICTED;
4605 }
4606
4607 /*
4608 * Tell ARC this no longer exists in L2ARC.
4609 */
4610 if (ab->b_l2hdr != NULL) {
4611 abl2 = ab->b_l2hdr;
4612 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4613 ab->b_l2hdr = NULL;
4614 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4615 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4616 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4617 }
4618 list_remove(buflist, ab);
4619
4620 /*
4621 * This may have been leftover after a
4622 * failed write.
4623 */
4624 ab->b_flags &= ~ARC_L2_WRITING;
4625 }
4626 mutex_exit(hash_lock);
4627 }
4628 mutex_exit(&l2arc_buflist_mtx);
4629
4630 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4631 dev->l2ad_evict = taddr;
4632 }
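
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the eviction target address chosen at the top of l2arc_evict().  When
 * the write hand is within two write distances of the end of the device,
 * the entire tail is cleared so the hand can wrap back to l2ad_start on
 * the next pass; otherwise only 'distance' bytes ahead of the hand are
 * cleared.
 */
static uint64_t
example_evict_target(uint64_t hand, uint64_t end, uint64_t distance)
{
	if (hand >= end - (2 * distance))
		return (end);			/* evict through to the device end */
	return (hand + distance);		/* evict just ahead of the write hand */
}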
4633
4634 /*
4635 * Find and write ARC buffers to the L2ARC device.
4636 *
4637 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4638 * for reading until they have completed writing.
4639 * The headroom_boost is an in-out parameter used to maintain headroom boost
4640 * state between calls to this function.
4641 *
4642 * Returns the number of bytes actually written (which may be smaller than
4643 * the delta by which the device hand has changed due to alignment).
4644 */
4645 static uint64_t
4646 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4647 boolean_t *headroom_boost)
4648 {
4649 arc_buf_hdr_t *ab, *ab_prev, *head;
4650 list_t *list;
4651 uint64_t write_asize, write_psize, write_sz, headroom,
4652 buf_compress_minsz;
4653 void *buf_data;
4654 kmutex_t *list_lock = NULL;
4655 boolean_t full;
4656 l2arc_write_callback_t *cb;
4657 zio_t *pio, *wzio;
4658 uint64_t guid = spa_load_guid(spa);
4659 int try;
4660 const boolean_t do_headroom_boost = *headroom_boost;
4661
4662 ASSERT(dev->l2ad_vdev != NULL);
4663
4664 /* Lower the flag now, we might want to raise it again later. */
4665 *headroom_boost = B_FALSE;
4666
4667 pio = NULL;
4668 write_sz = write_asize = write_psize = 0;
4669 full = B_FALSE;
4670 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4671 head->b_flags |= ARC_L2_WRITE_HEAD;
4672
4673 /*
4674 * We will want to try to compress buffers that are at least 2x the
4675 * device sector size.
4676 */
4677 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4678
4679 /*
4680 * Copy buffers for L2ARC writing.
4681 */
4682 mutex_enter(&l2arc_buflist_mtx);
4683 for (try = 0; try <= 3; try++) {
4684 uint64_t passed_sz = 0;
4685
4686 list = l2arc_list_locked(try, &list_lock);
4687
4688 /*
4689 * L2ARC fast warmup.
4690 *
4691 * Until the ARC is warm and starts to evict, read from the
4692 * head of the ARC lists rather than the tail.
4693 */
4694 if (arc_warm == B_FALSE)
4695 ab = list_head(list);
4696 else
4697 ab = list_tail(list);
4698
4699 headroom = target_sz * l2arc_headroom;
4700 if (do_headroom_boost)
4701 headroom = (headroom * l2arc_headroom_boost) / 100;
4702
4703 for (; ab; ab = ab_prev) {
4704 l2arc_buf_hdr_t *l2hdr;
4705 kmutex_t *hash_lock;
4706 uint64_t buf_sz;
4707
4708 if (arc_warm == B_FALSE)
4709 ab_prev = list_next(list, ab);
4710 else
4711 ab_prev = list_prev(list, ab);
4712
4713 hash_lock = HDR_LOCK(ab);
4714 if (!mutex_tryenter(hash_lock)) {
4715 /*
4716 * Skip this buffer rather than waiting.
4717 */
4718 continue;
4719 }
4720
4721 passed_sz += ab->b_size;
4722 if (passed_sz > headroom) {
4723 /*
4724 * Searched too far.
4725 */
4726 mutex_exit(hash_lock);
4727 break;
4728 }
4729
4730 if (!l2arc_write_eligible(guid, ab)) {
4731 mutex_exit(hash_lock);
4732 continue;
4733 }
4734
4735 if ((write_sz + ab->b_size) > target_sz) {
4736 full = B_TRUE;
4737 mutex_exit(hash_lock);
4738 break;
4739 }
4740
4741 if (pio == NULL) {
4742 /*
4743 * Insert a dummy header on the buflist so
4744 * l2arc_write_done() can find where the
4745 * write buffers begin without searching.
4746 */
4747 list_insert_head(dev->l2ad_buflist, head);
4748
4749 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
4750 KM_PUSHPAGE);
4751 cb->l2wcb_dev = dev;
4752 cb->l2wcb_head = head;
4753 pio = zio_root(spa, l2arc_write_done, cb,
4754 ZIO_FLAG_CANFAIL);
4755 }
4756
4757 /*
4758 * Create and add a new L2ARC header.
4759 */
4760 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
4761 KM_PUSHPAGE);
4762 l2hdr->b_dev = dev;
4763 arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4764
4765 ab->b_flags |= ARC_L2_WRITING;
4766
4767 /*
4768 * Temporarily stash the data buffer in b_tmp_cdata.
4769 * The subsequent write step will pick it up from
4770 * there. This is because we can't access ab->b_buf
4771 * without holding the hash_lock, which we in turn
4772 * can't take without holding the ARC list locks
4773 * (which we want to avoid during compression/writing).
4774 */
4775 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4776 l2hdr->b_asize = ab->b_size;
4777 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4778
4779 buf_sz = ab->b_size;
4780 ab->b_l2hdr = l2hdr;
4781
4782 list_insert_head(dev->l2ad_buflist, ab);
4783
4784 /*
4785 * Compute and store the buffer cksum before
4786 * writing. On debug the cksum is verified first.
4787 */
4788 arc_cksum_verify(ab->b_buf);
4789 arc_cksum_compute(ab->b_buf, B_TRUE);
4790
4791 mutex_exit(hash_lock);
4792
4793 write_sz += buf_sz;
4794 }
4795
4796 mutex_exit(list_lock);
4797
4798 if (full == B_TRUE)
4799 break;
4800 }
4801
4802 /* No buffers selected for writing? */
4803 if (pio == NULL) {
4804 ASSERT0(write_sz);
4805 mutex_exit(&l2arc_buflist_mtx);
4806 kmem_cache_free(hdr_cache, head);
4807 return (0);
4808 }
4809
4810 /*
4811 * Now start writing the buffers. We're starting at the write head
4812 * and work backwards, retracing the course of the buffer selector
4813 * loop above.
4814 */
4815 for (ab = list_prev(dev->l2ad_buflist, head); ab;
4816 ab = list_prev(dev->l2ad_buflist, ab)) {
4817 l2arc_buf_hdr_t *l2hdr;
4818 uint64_t buf_sz;
4819
4820 /*
4821 * We shouldn't need to lock the buffer here, since we flagged
4822 * it as ARC_L2_WRITING in the previous step, but we must take
4823 * care to only access its L2 cache parameters. In particular,
4824 * ab->b_buf may be invalid by now due to ARC eviction.
4825 */
4826 l2hdr = ab->b_l2hdr;
4827 l2hdr->b_daddr = dev->l2ad_hand;
4828
4829 if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
4830 l2hdr->b_asize >= buf_compress_minsz) {
4831 if (l2arc_compress_buf(l2hdr)) {
4832 /*
4833 * If compression succeeded, enable headroom
4834 * boost on the next scan cycle.
4835 */
4836 *headroom_boost = B_TRUE;
4837 }
4838 }
4839
4840 /*
4841 * Pick up the buffer data we had previously stashed away
4842 * (and now potentially also compressed).
4843 */
4844 buf_data = l2hdr->b_tmp_cdata;
4845 buf_sz = l2hdr->b_asize;
4846
4847 /* Compression may have squashed the buffer to zero length. */
4848 if (buf_sz != 0) {
4849 uint64_t buf_p_sz;
4850
4851 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4852 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4853 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4854 ZIO_FLAG_CANFAIL, B_FALSE);
4855
4856 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4857 zio_t *, wzio);
4858 (void) zio_nowait(wzio);
4859
4860 write_asize += buf_sz;
4861 /*
4862 * Keep the clock hand suitably device-aligned.
4863 */
4864 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4865 write_psize += buf_p_sz;
4866 dev->l2ad_hand += buf_p_sz;
4867 }
4868 }
4869
4870 mutex_exit(&l2arc_buflist_mtx);
4871
4872 ASSERT3U(write_asize, <=, target_sz);
4873 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4874 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4875 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4876 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4877 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4878
4879 /*
4880 * Bump device hand to the device start if it is approaching the end.
4881 * l2arc_evict() will already have evicted ahead for this case.
4882 */
4883 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4884 vdev_space_update(dev->l2ad_vdev,
4885 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4886 dev->l2ad_hand = dev->l2ad_start;
4887 dev->l2ad_evict = dev->l2ad_start;
4888 dev->l2ad_first = B_FALSE;
4889 }
4890
4891 dev->l2ad_writing = B_TRUE;
4892 (void) zio_wait(pio);
4893 dev->l2ad_writing = B_FALSE;
4894
4895 return (write_asize);
4896 }
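
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * how the L2ARC write hand stays device-aligned.  For a plain leaf vdev,
 * vdev_psize_to_asize() essentially rounds the write up to the device
 * sector size (1 << ashift), which is why write_psize can exceed
 * write_asize above.  'ashift' here is an assumed device sector shift.
 */
static uint64_t
example_advance_write_hand(uint64_t hand, uint64_t buf_sz, int ashift)
{
	uint64_t sector = 1ULL << ashift;
	uint64_t aligned = (buf_sz + sector - 1) & ~(sector - 1);

	return (hand + aligned);		/* next free offset on the device */
}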
4897
4898 /*
4899 * Compresses an L2ARC buffer.
4900 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4901 * size in l2hdr->b_asize. This routine tries to compress the data and
4902 * depending on the compression result there are three possible outcomes:
4903 * *) The buffer was incompressible. The original l2hdr contents were left
4904 * untouched and are ready for writing to an L2 device.
4905 * *) The buffer was all-zeros, so there is no need to write it to an L2
4906 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4907 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4908 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4909 * data buffer which holds the compressed data to be written, and b_asize
4910 * tells us how much data there is. b_compress is set to the appropriate
4911 * compression algorithm. Once writing is done, invoke
4912 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4913 *
4914 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4915 * buffer was incompressible).
4916 */
4917 static boolean_t
4918 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4919 {
4920 void *cdata;
4921 size_t csize, len;
4922
4923 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4924 ASSERT(l2hdr->b_tmp_cdata != NULL);
4925
4926 len = l2hdr->b_asize;
4927 cdata = zio_data_buf_alloc(len);
4928 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4929 cdata, l2hdr->b_asize);
4930
4931 if (csize == 0) {
4932 /* zero block, indicate that there's nothing to write */
4933 zio_data_buf_free(cdata, len);
4934 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4935 l2hdr->b_asize = 0;
4936 l2hdr->b_tmp_cdata = NULL;
4937 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4938 return (B_TRUE);
4939 } else if (csize > 0 && csize < len) {
4940 /*
4941 * Compression succeeded, we'll keep the cdata around for
4942 * writing and release it afterwards.
4943 */
4944 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4945 l2hdr->b_asize = csize;
4946 l2hdr->b_tmp_cdata = cdata;
4947 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4948 return (B_TRUE);
4949 } else {
4950 /*
4951 * Compression failed, release the compressed buffer.
4952 * l2hdr will be left unmodified.
4953 */
4954 zio_data_buf_free(cdata, len);
4955 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4956 return (B_FALSE);
4957 }
4958 }
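
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the three-way outcome documented above, reduced to the decision made on
 * the compressed size.  The enum labels are hypothetical and exist only
 * for this sketch; the real function records the result in b_compress,
 * b_asize and b_tmp_cdata instead.
 */
typedef enum {
	EXAMPLE_L2_EMPTY,		/* all zeros, nothing to write */
	EXAMPLE_L2_COMPRESSED,		/* keep the smaller temporary cdata buffer */
	EXAMPLE_L2_UNCOMPRESSED		/* no gain, write the original buffer as-is */
} example_l2_outcome_t;

static example_l2_outcome_t
example_compress_outcome(size_t csize, size_t len)
{
	if (csize == 0)
		return (EXAMPLE_L2_EMPTY);
	if (csize < len)
		return (EXAMPLE_L2_COMPRESSED);
	return (EXAMPLE_L2_UNCOMPRESSED);
}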
4959
4960 /*
4961 * Decompresses a zio read back from an l2arc device. On success, the
4962 * underlying zio's io_data buffer is overwritten by the uncompressed
4963 * version. On decompression error (corrupt compressed stream), the
4964 * zio->io_error value is set to signal an I/O error.
4965 *
4966 * Please note that the compressed data stream is not checksummed, so
4967 * if the underlying device is experiencing data corruption, we may feed
4968 * corrupt data to the decompressor; the decompressor therefore needs to be
4969 * able to handle this situation (LZ4 does).
4970 */
4971 static void
4972 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4973 {
4974 uint64_t csize;
4975 void *cdata;
4976
4977 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4978
4979 if (zio->io_error != 0) {
4980 /*
4981 * An I/O error has occurred; just restore the original I/O
4982 * size in preparation for a main pool read.
4983 */
4984 zio->io_orig_size = zio->io_size = hdr->b_size;
4985 return;
4986 }
4987
4988 if (c == ZIO_COMPRESS_EMPTY) {
4989 /*
4990 * An empty buffer results in a null zio, which means we
4991 * need to fill its io_data after we're done restoring the
4992 * buffer's contents.
4993 */
4994 ASSERT(hdr->b_buf != NULL);
4995 bzero(hdr->b_buf->b_data, hdr->b_size);
4996 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
4997 } else {
4998 ASSERT(zio->io_data != NULL);
4999 /*
5000 * We copy the compressed data from the start of the arc buffer
5001 * (the zio_read will have pulled in only what we need, the
5002 * rest is garbage which we will overwrite at decompression)
5003 * and then decompress back to the ARC data buffer. This way we
5004 * can minimize copying by simply decompressing back over the
5005 * original compressed data (rather than decompressing to an
5006 * aux buffer and then copying back the uncompressed buffer,
5007 * which is likely to be much larger).
5008 */
5009 csize = zio->io_size;
5010 cdata = zio_data_buf_alloc(csize);
5011 bcopy(zio->io_data, cdata, csize);
5012 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5013 hdr->b_size) != 0)
5014 zio->io_error = EIO;
5015 zio_data_buf_free(cdata, csize);
5016 }
5017
5018 /* Restore the expected uncompressed IO size. */
5019 zio->io_orig_size = zio->io_size = hdr->b_size;
5020 }
5021
5022 /*
5023 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5024 * This buffer serves as a temporary holder of compressed data while
5025 * the buffer entry is being written to an l2arc device. Once that is
5026 * done, we can dispose of it.
5027 */
5028 static void
5029 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5030 {
5031 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5032
5033 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5034 /*
5035 * If the data was compressed, then we've allocated a
5036 * temporary buffer for it, so now we need to release it.
5037 */
5038 ASSERT(l2hdr->b_tmp_cdata != NULL);
5039 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5040 }
5041 l2hdr->b_tmp_cdata = NULL;
5042 }
5043
5044 /*
5045 * This thread feeds the L2ARC at regular intervals. This is the beating
5046 * heart of the L2ARC.
5047 */
5048 static void
5049 l2arc_feed_thread(void)
5050 {
5051 callb_cpr_t cpr;
5052 l2arc_dev_t *dev;
5053 spa_t *spa;
5054 uint64_t size, wrote;
5055 clock_t begin, next = ddi_get_lbolt();
5056 boolean_t headroom_boost = B_FALSE;
5057
5058 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5059
5060 mutex_enter(&l2arc_feed_thr_lock);
5061
5062 while (l2arc_thread_exit == 0) {
5063 CALLB_CPR_SAFE_BEGIN(&cpr);
5064 (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
5065 &l2arc_feed_thr_lock, next);
5066 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5067 next = ddi_get_lbolt() + hz;
5068
5069 /*
5070 * Quick check for L2ARC devices.
5071 */
5072 mutex_enter(&l2arc_dev_mtx);
5073 if (l2arc_ndev == 0) {
5074 mutex_exit(&l2arc_dev_mtx);
5075 continue;
5076 }
5077 mutex_exit(&l2arc_dev_mtx);
5078 begin = ddi_get_lbolt();
5079
5080 /*
5081 * This selects the next l2arc device to write to, and in
5082 * doing so the next spa to feed from: dev->l2ad_spa. This
5083 * will return NULL if there are now no l2arc devices or if
5084 * they are all faulted.
5085 *
5086 * If a device is returned, its spa's config lock is also
5087 * held to prevent device removal. l2arc_dev_get_next()
5088 * will grab and release l2arc_dev_mtx.
5089 */
5090 if ((dev = l2arc_dev_get_next()) == NULL)
5091 continue;
5092
5093 spa = dev->l2ad_spa;
5094 ASSERT(spa != NULL);
5095
5096 /*
5097 * If the pool is read-only then force the feed thread to
5098 * sleep a little longer.
5099 */
5100 if (!spa_writeable(spa)) {
5101 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5102 spa_config_exit(spa, SCL_L2ARC, dev);
5103 continue;
5104 }
5105
5106 /*
5107 * Avoid contributing to memory pressure.
5108 */
5109 if (arc_no_grow) {
5110 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5111 spa_config_exit(spa, SCL_L2ARC, dev);
5112 continue;
5113 }
5114
5115 ARCSTAT_BUMP(arcstat_l2_feeds);
5116
5117 size = l2arc_write_size();
5118
5119 /*
5120 * Evict L2ARC buffers that will be overwritten.
5121 */
5122 l2arc_evict(dev, size, B_FALSE);
5123
5124 /*
5125 * Write ARC buffers.
5126 */
5127 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5128
5129 /*
5130 * Calculate interval between writes.
5131 */
5132 next = l2arc_write_interval(begin, size, wrote);
5133 spa_config_exit(spa, SCL_L2ARC, dev);
5134 }
5135
5136 l2arc_thread_exit = 0;
5137 cv_broadcast(&l2arc_feed_thr_cv);
5138 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5139 thread_exit();
5140 }
5141
5142 boolean_t
5143 l2arc_vdev_present(vdev_t *vd)
5144 {
5145 l2arc_dev_t *dev;
5146
5147 mutex_enter(&l2arc_dev_mtx);
5148 for (dev = list_head(l2arc_dev_list); dev != NULL;
5149 dev = list_next(l2arc_dev_list, dev)) {
5150 if (dev->l2ad_vdev == vd)
5151 break;
5152 }
5153 mutex_exit(&l2arc_dev_mtx);
5154
5155 return (dev != NULL);
5156 }
5157
5158 /*
5159 * Add a vdev for use by the L2ARC. By this point the spa has already
5160 * validated the vdev and opened it.
5161 */
5162 void
5163 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5164 {
5165 l2arc_dev_t *adddev;
5166
5167 ASSERT(!l2arc_vdev_present(vd));
5168
5169 /*
5170 * Create a new l2arc device entry.
5171 */
5172 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5173 adddev->l2ad_spa = spa;
5174 adddev->l2ad_vdev = vd;
5175 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5176 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5177 adddev->l2ad_hand = adddev->l2ad_start;
5178 adddev->l2ad_evict = adddev->l2ad_start;
5179 adddev->l2ad_first = B_TRUE;
5180 adddev->l2ad_writing = B_FALSE;
5181 list_link_init(&adddev->l2ad_node);
5182
5183 /*
5184 * This is a list of all ARC buffers that are still valid on the
5185 * device.
5186 */
5187 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5188 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5189 offsetof(arc_buf_hdr_t, b_l2node));
5190
5191 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5192
5193 /*
5194 * Add device to global list
5195 */
5196 mutex_enter(&l2arc_dev_mtx);
5197 list_insert_head(l2arc_dev_list, adddev);
5198 atomic_inc_64(&l2arc_ndev);
5199 mutex_exit(&l2arc_dev_mtx);
5200 }
5201
5202 /*
5203 * Remove a vdev from the L2ARC.
5204 */
5205 void
5206 l2arc_remove_vdev(vdev_t *vd)
5207 {
5208 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5209
5210 /*
5211 * Find the device by vdev
5212 */
5213 mutex_enter(&l2arc_dev_mtx);
5214 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5215 nextdev = list_next(l2arc_dev_list, dev);
5216 if (vd == dev->l2ad_vdev) {
5217 remdev = dev;
5218 break;
5219 }
5220 }
5221 ASSERT(remdev != NULL);
5222
5223 /*
5224 * Remove device from global list
5225 */
5226 list_remove(l2arc_dev_list, remdev);
5227 l2arc_dev_last = NULL; /* may have been invalidated */
5228 atomic_dec_64(&l2arc_ndev);
5229 mutex_exit(&l2arc_dev_mtx);
5230
5231 /*
5232 * Clear all buflists and ARC references. L2ARC device flush.
5233 */
5234 l2arc_evict(remdev, 0, B_TRUE);
5235 list_destroy(remdev->l2ad_buflist);
5236 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5237 kmem_free(remdev, sizeof (l2arc_dev_t));
5238 }
5239
5240 void
5241 l2arc_init(void)
5242 {
5243 l2arc_thread_exit = 0;
5244 l2arc_ndev = 0;
5245 l2arc_writes_sent = 0;
5246 l2arc_writes_done = 0;
5247
5248 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5249 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5250 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5251 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5252 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5253
5254 l2arc_dev_list = &L2ARC_dev_list;
5255 l2arc_free_on_write = &L2ARC_free_on_write;
5256 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5257 offsetof(l2arc_dev_t, l2ad_node));
5258 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5259 offsetof(l2arc_data_free_t, l2df_list_node));
5260 }
5261
5262 void
5263 l2arc_fini(void)
5264 {
5265 /*
5266 * This is called from dmu_fini(), which is called from spa_fini().
5267 * Because of this, we can assume that all l2arc devices have
5268 * already been removed when the pools themselves were removed.
5269 */
5270
5271 l2arc_do_free_on_write();
5272
5273 mutex_destroy(&l2arc_feed_thr_lock);
5274 cv_destroy(&l2arc_feed_thr_cv);
5275 mutex_destroy(&l2arc_dev_mtx);
5276 mutex_destroy(&l2arc_buflist_mtx);
5277 mutex_destroy(&l2arc_free_on_write_mtx);
5278
5279 list_destroy(l2arc_dev_list);
5280 list_destroy(l2arc_free_on_write);
5281 }
5282
5283 void
5284 l2arc_start(void)
5285 {
5286 if (!(spa_mode_global & FWRITE))
5287 return;
5288
5289 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5290 TS_RUN, minclsyspri);
5291 }
5292
5293 void
5294 l2arc_stop(void)
5295 {
5296 if (!(spa_mode_global & FWRITE))
5297 return;
5298
5299 mutex_enter(&l2arc_feed_thr_lock);
5300 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5301 l2arc_thread_exit = 1;
5302 while (l2arc_thread_exit != 0)
5303 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5304 mutex_exit(&l2arc_feed_thr_lock);
5305 }
5306
5307 #if defined(_KERNEL) && defined(HAVE_SPL)
5308 EXPORT_SYMBOL(arc_read);
5309 EXPORT_SYMBOL(arc_buf_remove_ref);
5310 EXPORT_SYMBOL(arc_getbuf_func);
5311 EXPORT_SYMBOL(arc_add_prune_callback);
5312 EXPORT_SYMBOL(arc_remove_prune_callback);
5313
5314 module_param(zfs_arc_min, ulong, 0644);
5315 MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
5316
5317 module_param(zfs_arc_max, ulong, 0644);
5318 MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
5319
5320 module_param(zfs_arc_meta_limit, ulong, 0644);
5321 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
5322
5323 module_param(zfs_arc_meta_prune, int, 0644);
5324 MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
5325
5326 module_param(zfs_arc_grow_retry, int, 0644);
5327 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
5328
5329 module_param(zfs_arc_shrink_shift, int, 0644);
5330 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
5331
5332 module_param(zfs_arc_p_min_shift, int, 0644);
5333 MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
5334
5335 module_param(zfs_disable_dup_eviction, int, 0644);
5336 MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
5337
5338 module_param(zfs_arc_memory_throttle_disable, int, 0644);
5339 MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
5340
5341 module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
5342 MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
5343
5344 module_param(l2arc_write_max, ulong, 0644);
5345 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
5346
5347 module_param(l2arc_write_boost, ulong, 0644);
5348 MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
5349
5350 module_param(l2arc_headroom, ulong, 0644);
5351 MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
5352
5353 module_param(l2arc_headroom_boost, ulong, 0644);
5354 MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
5355
5356 module_param(l2arc_feed_secs, ulong, 0644);
5357 MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
5358
5359 module_param(l2arc_feed_min_ms, ulong, 0644);
5360 MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
5361
5362 module_param(l2arc_noprefetch, int, 0644);
5363 MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
5364
5365 module_param(l2arc_nocompress, int, 0644);
5366 MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
5367
5368 module_param(l2arc_feed_again, int, 0644);
5369 MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
5370
5371 module_param(l2arc_norw, int, 0644);
5372 MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
5373
5374 #endif