]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/arc.c
Remove double counting HDR_L2ONLY_SIZE
[mirror_zfs.git] / module / zfs / arc.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
36da08ef
PS
23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
3bec585e 26 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
34dc7c2f
BB
27 */
28
34dc7c2f
BB
29/*
30 * DVA-based Adjustable Replacement Cache
31 *
32 * While much of the theory of operation used here is
33 * based on the self-tuning, low overhead replacement cache
34 * presented by Megiddo and Modha at FAST 2003, there are some
35 * significant differences:
36 *
37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 * Pages in its cache cannot be "locked" into memory. This makes
39 * the eviction algorithm simple: evict the last page in the list.
40 * This also make the performance characteristics easy to reason
41 * about. Our cache is not so simple. At any given moment, some
42 * subset of the blocks in the cache are un-evictable because we
43 * have handed out a reference to them. Blocks are only evictable
44 * when there are no external references active. This makes
45 * eviction far more problematic: we choose to evict the evictable
46 * blocks that are the "lowest" in the list.
47 *
48 * There are times when it is not possible to evict the requested
49 * space. In these circumstances we are unable to adjust the cache
50 * size. To prevent the cache growing unbounded at these times we
51 * implement a "cache throttle" that slows the flow of new data
52 * into the cache until we can make space available.
53 *
54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 * Pages are evicted when the cache is full and there is a cache
56 * miss. Our model has a variable sized cache. It grows with
57 * high use, but also tries to react to memory pressure from the
58 * operating system: decreasing its size when system memory is
59 * tight.
60 *
61 * 3. The Megiddo and Modha model assumes a fixed page size. All
d3cc8b15 62 * elements of the cache are therefore exactly the same size. So
34dc7c2f
BB
63 * when adjusting the cache size following a cache miss, its simply
64 * a matter of choosing a single page to evict. In our model, we
65 * have variable sized cache blocks (rangeing from 512 bytes to
d3cc8b15 66 * 128K bytes). We therefore choose a set of blocks to evict to make
34dc7c2f
BB
67 * space for a cache miss that approximates as closely as possible
68 * the space used by the new block.
69 *
70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 * by N. Megiddo & D. Modha, FAST 2003
72 */
73
74/*
75 * The locking model:
76 *
77 * A new reference to a cache buffer can be obtained in two
78 * ways: 1) via a hash table lookup using the DVA as a key,
79 * or 2) via one of the ARC lists. The arc_read() interface
80 * uses method 1, while the internal arc algorithms for
d3cc8b15 81 * adjusting the cache use method 2. We therefore provide two
34dc7c2f
BB
82 * types of locks: 1) the hash table lock array, and 2) the
83 * arc list locks.
84 *
5c839890
BC
85 * Buffers do not have their own mutexes, rather they rely on the
86 * hash table mutexes for the bulk of their protection (i.e. most
87 * fields in the arc_buf_hdr_t are protected by these mutexes).
34dc7c2f
BB
88 *
89 * buf_hash_find() returns the appropriate mutex (held) when it
90 * locates the requested buffer in the hash table. It returns
91 * NULL for the mutex if the buffer was not in the table.
92 *
93 * buf_hash_remove() expects the appropriate hash mutex to be
94 * already held before it is invoked.
95 *
96 * Each arc state also has a mutex which is used to protect the
97 * buffer list associated with the state. When attempting to
98 * obtain a hash table lock while holding an arc list lock you
99 * must use: mutex_tryenter() to avoid deadlock. Also note that
100 * the active state mutex must be held before the ghost state mutex.
101 *
102 * Arc buffers may have an associated eviction callback function.
103 * This function will be invoked prior to removing the buffer (e.g.
104 * in arc_do_user_evicts()). Note however that the data associated
105 * with the buffer may be evicted prior to the callback. The callback
106 * must be made with *no locks held* (to prevent deadlock). Additionally,
107 * the users of callbacks must ensure that their private data is
bd089c54 108 * protected from simultaneous callbacks from arc_clear_callback()
34dc7c2f
BB
109 * and arc_do_user_evicts().
110 *
ab26409d
BB
111 * It as also possible to register a callback which is run when the
112 * arc_meta_limit is reached and no buffers can be safely evicted. In
113 * this case the arc user should drop a reference on some arc buffers so
114 * they can be reclaimed and the arc_meta_limit honored. For example,
115 * when using the ZPL each dentry holds a references on a znode. These
116 * dentries must be pruned before the arc buffer holding the znode can
117 * be safely evicted.
118 *
34dc7c2f
BB
119 * Note that the majority of the performance stats are manipulated
120 * with atomic operations.
121 *
b9541d6b 122 * The L2ARC uses the l2ad_mtx on each vdev for the following:
34dc7c2f
BB
123 *
124 * - L2ARC buflist creation
125 * - L2ARC buflist eviction
126 * - L2ARC write completion, which walks L2ARC buflists
127 * - ARC header destruction, as it removes from L2ARC buflists
128 * - ARC header release, as it removes from L2ARC buflists
129 */
130
131#include <sys/spa.h>
132#include <sys/zio.h>
3a17a7a9 133#include <sys/zio_compress.h>
34dc7c2f
BB
134#include <sys/zfs_context.h>
135#include <sys/arc.h>
36da08ef 136#include <sys/refcount.h>
b128c09f 137#include <sys/vdev.h>
9babb374 138#include <sys/vdev_impl.h>
e8b96c60 139#include <sys/dsl_pool.h>
ca0bf58d 140#include <sys/multilist.h>
34dc7c2f
BB
141#ifdef _KERNEL
142#include <sys/vmsystm.h>
143#include <vm/anon.h>
144#include <sys/fs/swapnode.h>
ab26409d 145#include <sys/zpl.h>
aaed7c40 146#include <linux/mm_compat.h>
34dc7c2f
BB
147#endif
148#include <sys/callb.h>
149#include <sys/kstat.h>
570827e1 150#include <sys/dmu_tx.h>
428870ff 151#include <zfs_fletcher.h>
59ec819a 152#include <sys/arc_impl.h>
49ee64e5 153#include <sys/trace_arc.h>
34dc7c2f 154
498877ba
MA
155#ifndef _KERNEL
156/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
157boolean_t arc_watch = B_FALSE;
158#endif
159
ca0bf58d
PS
160static kmutex_t arc_reclaim_lock;
161static kcondvar_t arc_reclaim_thread_cv;
162static boolean_t arc_reclaim_thread_exit;
163static kcondvar_t arc_reclaim_waiters_cv;
164
165static kmutex_t arc_user_evicts_lock;
166static kcondvar_t arc_user_evicts_cv;
167static boolean_t arc_user_evicts_thread_exit;
34dc7c2f 168
e8b96c60 169/*
ca0bf58d
PS
170 * The number of headers to evict in arc_evict_state_impl() before
171 * dropping the sublist lock and evicting from another sublist. A lower
172 * value means we're more likely to evict the "correct" header (i.e. the
173 * oldest header in the arc state), but comes with higher overhead
174 * (i.e. more invocations of arc_evict_state_impl()).
175 */
176int zfs_arc_evict_batch_limit = 10;
177
178/*
179 * The number of sublists used for each of the arc state lists. If this
180 * is not set to a suitable value by the user, it will be configured to
181 * the number of CPUs on the system in arc_init().
e8b96c60 182 */
ca0bf58d 183int zfs_arc_num_sublists_per_state = 0;
e8b96c60 184
34dc7c2f 185/* number of seconds before growing cache again */
ca67b33a 186static int arc_grow_retry = 5;
34dc7c2f 187
ca0bf58d 188/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
ca67b33a 189int zfs_arc_overflow_shift = 8;
62422785 190
728d6ae9
BB
191/* shift of arc_c for calculating both min and max arc_p */
192static int arc_p_min_shift = 4;
193
d164b209 194/* log2(fraction of arc to reclaim) */
ca67b33a 195static int arc_shrink_shift = 7;
d164b209 196
34dc7c2f 197/*
ca67b33a
MA
198 * log2(fraction of ARC which must be free to allow growing).
199 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
200 * when reading a new block into the ARC, we will evict an equal-sized block
201 * from the ARC.
202 *
203 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
204 * we will still not allow it to grow.
34dc7c2f 205 */
ca67b33a 206int arc_no_grow_shift = 5;
bce45ec9 207
49ddb315 208
ca0bf58d
PS
209/*
210 * minimum lifespan of a prefetch block in clock ticks
211 * (initialized in arc_init())
212 */
ca67b33a 213static int arc_min_prefetch_lifespan;
ca0bf58d 214
e8b96c60
MA
215/*
216 * If this percent of memory is free, don't throttle.
217 */
218int arc_lotsfree_percent = 10;
219
34dc7c2f
BB
220static int arc_dead;
221
b128c09f
BB
222/*
223 * The arc has filled available memory and has now warmed up.
224 */
225static boolean_t arc_warm;
226
34dc7c2f
BB
227/*
228 * These tunables are for performance analysis.
229 */
c28b2279
BB
230unsigned long zfs_arc_max = 0;
231unsigned long zfs_arc_min = 0;
232unsigned long zfs_arc_meta_limit = 0;
ca0bf58d 233unsigned long zfs_arc_meta_min = 0;
ca67b33a
MA
234int zfs_arc_grow_retry = 0;
235int zfs_arc_shrink_shift = 0;
728d6ae9 236int zfs_arc_p_min_shift = 0;
ca67b33a
MA
237int zfs_disable_dup_eviction = 0;
238int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
34dc7c2f 239
bc888666 240/*
ca67b33a 241 * These tunables are Linux specific
bc888666 242 */
ca67b33a
MA
243int zfs_arc_memory_throttle_disable = 1;
244int zfs_arc_min_prefetch_lifespan = 0;
245int zfs_arc_p_aggressive_disable = 1;
246int zfs_arc_p_dampener_disable = 1;
247int zfs_arc_meta_prune = 10000;
248int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
249int zfs_arc_meta_adjust_restarts = 4096;
bc888666 250
34dc7c2f
BB
251/* The 6 states: */
252static arc_state_t ARC_anon;
253static arc_state_t ARC_mru;
254static arc_state_t ARC_mru_ghost;
255static arc_state_t ARC_mfu;
256static arc_state_t ARC_mfu_ghost;
257static arc_state_t ARC_l2c_only;
258
259typedef struct arc_stats {
260 kstat_named_t arcstat_hits;
261 kstat_named_t arcstat_misses;
262 kstat_named_t arcstat_demand_data_hits;
263 kstat_named_t arcstat_demand_data_misses;
264 kstat_named_t arcstat_demand_metadata_hits;
265 kstat_named_t arcstat_demand_metadata_misses;
266 kstat_named_t arcstat_prefetch_data_hits;
267 kstat_named_t arcstat_prefetch_data_misses;
268 kstat_named_t arcstat_prefetch_metadata_hits;
269 kstat_named_t arcstat_prefetch_metadata_misses;
270 kstat_named_t arcstat_mru_hits;
271 kstat_named_t arcstat_mru_ghost_hits;
272 kstat_named_t arcstat_mfu_hits;
273 kstat_named_t arcstat_mfu_ghost_hits;
274 kstat_named_t arcstat_deleted;
e49f1e20
WA
275 /*
276 * Number of buffers that could not be evicted because the hash lock
277 * was held by another thread. The lock may not necessarily be held
278 * by something using the same buffer, since hash locks are shared
279 * by multiple buffers.
280 */
34dc7c2f 281 kstat_named_t arcstat_mutex_miss;
e49f1e20
WA
282 /*
283 * Number of buffers skipped because they have I/O in progress, are
284 * indrect prefetch buffers that have not lived long enough, or are
285 * not from the spa we're trying to evict from.
286 */
34dc7c2f 287 kstat_named_t arcstat_evict_skip;
ca0bf58d
PS
288 /*
289 * Number of times arc_evict_state() was unable to evict enough
290 * buffers to reach its target amount.
291 */
292 kstat_named_t arcstat_evict_not_enough;
428870ff
BB
293 kstat_named_t arcstat_evict_l2_cached;
294 kstat_named_t arcstat_evict_l2_eligible;
295 kstat_named_t arcstat_evict_l2_ineligible;
ca0bf58d 296 kstat_named_t arcstat_evict_l2_skip;
34dc7c2f
BB
297 kstat_named_t arcstat_hash_elements;
298 kstat_named_t arcstat_hash_elements_max;
299 kstat_named_t arcstat_hash_collisions;
300 kstat_named_t arcstat_hash_chains;
301 kstat_named_t arcstat_hash_chain_max;
302 kstat_named_t arcstat_p;
303 kstat_named_t arcstat_c;
304 kstat_named_t arcstat_c_min;
305 kstat_named_t arcstat_c_max;
306 kstat_named_t arcstat_size;
500445c0
PS
307 /*
308 * Number of bytes consumed by internal ARC structures necessary
309 * for tracking purposes; these structures are not actually
310 * backed by ARC buffers. This includes arc_buf_hdr_t structures
311 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
312 * caches), and arc_buf_t structures (allocated via arc_buf_t
313 * cache).
314 */
34dc7c2f 315 kstat_named_t arcstat_hdr_size;
500445c0
PS
316 /*
317 * Number of bytes consumed by ARC buffers of type equal to
318 * ARC_BUFC_DATA. This is generally consumed by buffers backing
319 * on disk user data (e.g. plain file contents).
320 */
d164b209 321 kstat_named_t arcstat_data_size;
500445c0
PS
322 /*
323 * Number of bytes consumed by ARC buffers of type equal to
324 * ARC_BUFC_METADATA. This is generally consumed by buffers
325 * backing on disk data that is used for internal ZFS
326 * structures (e.g. ZAP, dnode, indirect blocks, etc).
327 */
328 kstat_named_t arcstat_metadata_size;
329 /*
330 * Number of bytes consumed by various buffers and structures
331 * not actually backed with ARC buffers. This includes bonus
332 * buffers (allocated directly via zio_buf_* functions),
333 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
334 * cache), and dnode_t structures (allocated via dnode_t cache).
335 */
d164b209 336 kstat_named_t arcstat_other_size;
500445c0
PS
337 /*
338 * Total number of bytes consumed by ARC buffers residing in the
339 * arc_anon state. This includes *all* buffers in the arc_anon
340 * state; e.g. data, metadata, evictable, and unevictable buffers
341 * are all included in this value.
342 */
13be560d 343 kstat_named_t arcstat_anon_size;
500445c0
PS
344 /*
345 * Number of bytes consumed by ARC buffers that meet the
346 * following criteria: backing buffers of type ARC_BUFC_DATA,
347 * residing in the arc_anon state, and are eligible for eviction
348 * (e.g. have no outstanding holds on the buffer).
349 */
350 kstat_named_t arcstat_anon_evictable_data;
351 /*
352 * Number of bytes consumed by ARC buffers that meet the
353 * following criteria: backing buffers of type ARC_BUFC_METADATA,
354 * residing in the arc_anon state, and are eligible for eviction
355 * (e.g. have no outstanding holds on the buffer).
356 */
357 kstat_named_t arcstat_anon_evictable_metadata;
358 /*
359 * Total number of bytes consumed by ARC buffers residing in the
360 * arc_mru state. This includes *all* buffers in the arc_mru
361 * state; e.g. data, metadata, evictable, and unevictable buffers
362 * are all included in this value.
363 */
13be560d 364 kstat_named_t arcstat_mru_size;
500445c0
PS
365 /*
366 * Number of bytes consumed by ARC buffers that meet the
367 * following criteria: backing buffers of type ARC_BUFC_DATA,
368 * residing in the arc_mru state, and are eligible for eviction
369 * (e.g. have no outstanding holds on the buffer).
370 */
371 kstat_named_t arcstat_mru_evictable_data;
372 /*
373 * Number of bytes consumed by ARC buffers that meet the
374 * following criteria: backing buffers of type ARC_BUFC_METADATA,
375 * residing in the arc_mru state, and are eligible for eviction
376 * (e.g. have no outstanding holds on the buffer).
377 */
378 kstat_named_t arcstat_mru_evictable_metadata;
379 /*
380 * Total number of bytes that *would have been* consumed by ARC
381 * buffers in the arc_mru_ghost state. The key thing to note
382 * here, is the fact that this size doesn't actually indicate
383 * RAM consumption. The ghost lists only consist of headers and
384 * don't actually have ARC buffers linked off of these headers.
385 * Thus, *if* the headers had associated ARC buffers, these
386 * buffers *would have* consumed this number of bytes.
387 */
13be560d 388 kstat_named_t arcstat_mru_ghost_size;
500445c0
PS
389 /*
390 * Number of bytes that *would have been* consumed by ARC
391 * buffers that are eligible for eviction, of type
392 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
393 */
394 kstat_named_t arcstat_mru_ghost_evictable_data;
395 /*
396 * Number of bytes that *would have been* consumed by ARC
397 * buffers that are eligible for eviction, of type
398 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
399 */
400 kstat_named_t arcstat_mru_ghost_evictable_metadata;
401 /*
402 * Total number of bytes consumed by ARC buffers residing in the
403 * arc_mfu state. This includes *all* buffers in the arc_mfu
404 * state; e.g. data, metadata, evictable, and unevictable buffers
405 * are all included in this value.
406 */
13be560d 407 kstat_named_t arcstat_mfu_size;
500445c0
PS
408 /*
409 * Number of bytes consumed by ARC buffers that are eligible for
410 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
411 * state.
412 */
413 kstat_named_t arcstat_mfu_evictable_data;
414 /*
415 * Number of bytes consumed by ARC buffers that are eligible for
416 * eviction, of type ARC_BUFC_METADATA, and reside in the
417 * arc_mfu state.
418 */
419 kstat_named_t arcstat_mfu_evictable_metadata;
420 /*
421 * Total number of bytes that *would have been* consumed by ARC
422 * buffers in the arc_mfu_ghost state. See the comment above
423 * arcstat_mru_ghost_size for more details.
424 */
13be560d 425 kstat_named_t arcstat_mfu_ghost_size;
500445c0
PS
426 /*
427 * Number of bytes that *would have been* consumed by ARC
428 * buffers that are eligible for eviction, of type
429 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
430 */
431 kstat_named_t arcstat_mfu_ghost_evictable_data;
432 /*
433 * Number of bytes that *would have been* consumed by ARC
434 * buffers that are eligible for eviction, of type
435 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
436 */
437 kstat_named_t arcstat_mfu_ghost_evictable_metadata;
34dc7c2f
BB
438 kstat_named_t arcstat_l2_hits;
439 kstat_named_t arcstat_l2_misses;
440 kstat_named_t arcstat_l2_feeds;
441 kstat_named_t arcstat_l2_rw_clash;
d164b209
BB
442 kstat_named_t arcstat_l2_read_bytes;
443 kstat_named_t arcstat_l2_write_bytes;
34dc7c2f
BB
444 kstat_named_t arcstat_l2_writes_sent;
445 kstat_named_t arcstat_l2_writes_done;
446 kstat_named_t arcstat_l2_writes_error;
ca0bf58d 447 kstat_named_t arcstat_l2_writes_lock_retry;
34dc7c2f
BB
448 kstat_named_t arcstat_l2_evict_lock_retry;
449 kstat_named_t arcstat_l2_evict_reading;
b9541d6b 450 kstat_named_t arcstat_l2_evict_l1cached;
34dc7c2f 451 kstat_named_t arcstat_l2_free_on_write;
ca0bf58d 452 kstat_named_t arcstat_l2_cdata_free_on_write;
34dc7c2f
BB
453 kstat_named_t arcstat_l2_abort_lowmem;
454 kstat_named_t arcstat_l2_cksum_bad;
455 kstat_named_t arcstat_l2_io_error;
456 kstat_named_t arcstat_l2_size;
3a17a7a9 457 kstat_named_t arcstat_l2_asize;
34dc7c2f 458 kstat_named_t arcstat_l2_hdr_size;
3a17a7a9
SK
459 kstat_named_t arcstat_l2_compress_successes;
460 kstat_named_t arcstat_l2_compress_zeros;
461 kstat_named_t arcstat_l2_compress_failures;
34dc7c2f 462 kstat_named_t arcstat_memory_throttle_count;
1eb5bfa3
GW
463 kstat_named_t arcstat_duplicate_buffers;
464 kstat_named_t arcstat_duplicate_buffers_size;
465 kstat_named_t arcstat_duplicate_reads;
7cb67b45
BB
466 kstat_named_t arcstat_memory_direct_count;
467 kstat_named_t arcstat_memory_indirect_count;
1834f2d8
BB
468 kstat_named_t arcstat_no_grow;
469 kstat_named_t arcstat_tempreserve;
470 kstat_named_t arcstat_loaned_bytes;
ab26409d 471 kstat_named_t arcstat_prune;
1834f2d8
BB
472 kstat_named_t arcstat_meta_used;
473 kstat_named_t arcstat_meta_limit;
474 kstat_named_t arcstat_meta_max;
ca0bf58d 475 kstat_named_t arcstat_meta_min;
34dc7c2f
BB
476} arc_stats_t;
477
478static arc_stats_t arc_stats = {
479 { "hits", KSTAT_DATA_UINT64 },
480 { "misses", KSTAT_DATA_UINT64 },
481 { "demand_data_hits", KSTAT_DATA_UINT64 },
482 { "demand_data_misses", KSTAT_DATA_UINT64 },
483 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
484 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
485 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
486 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
487 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
488 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
489 { "mru_hits", KSTAT_DATA_UINT64 },
490 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
491 { "mfu_hits", KSTAT_DATA_UINT64 },
492 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
493 { "deleted", KSTAT_DATA_UINT64 },
34dc7c2f
BB
494 { "mutex_miss", KSTAT_DATA_UINT64 },
495 { "evict_skip", KSTAT_DATA_UINT64 },
ca0bf58d 496 { "evict_not_enough", KSTAT_DATA_UINT64 },
428870ff
BB
497 { "evict_l2_cached", KSTAT_DATA_UINT64 },
498 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
499 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
ca0bf58d 500 { "evict_l2_skip", KSTAT_DATA_UINT64 },
34dc7c2f
BB
501 { "hash_elements", KSTAT_DATA_UINT64 },
502 { "hash_elements_max", KSTAT_DATA_UINT64 },
503 { "hash_collisions", KSTAT_DATA_UINT64 },
504 { "hash_chains", KSTAT_DATA_UINT64 },
505 { "hash_chain_max", KSTAT_DATA_UINT64 },
506 { "p", KSTAT_DATA_UINT64 },
507 { "c", KSTAT_DATA_UINT64 },
508 { "c_min", KSTAT_DATA_UINT64 },
509 { "c_max", KSTAT_DATA_UINT64 },
510 { "size", KSTAT_DATA_UINT64 },
511 { "hdr_size", KSTAT_DATA_UINT64 },
d164b209 512 { "data_size", KSTAT_DATA_UINT64 },
500445c0 513 { "metadata_size", KSTAT_DATA_UINT64 },
d164b209 514 { "other_size", KSTAT_DATA_UINT64 },
13be560d 515 { "anon_size", KSTAT_DATA_UINT64 },
500445c0
PS
516 { "anon_evictable_data", KSTAT_DATA_UINT64 },
517 { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
13be560d 518 { "mru_size", KSTAT_DATA_UINT64 },
500445c0
PS
519 { "mru_evictable_data", KSTAT_DATA_UINT64 },
520 { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
13be560d 521 { "mru_ghost_size", KSTAT_DATA_UINT64 },
500445c0
PS
522 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
523 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
13be560d 524 { "mfu_size", KSTAT_DATA_UINT64 },
500445c0
PS
525 { "mfu_evictable_data", KSTAT_DATA_UINT64 },
526 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
13be560d 527 { "mfu_ghost_size", KSTAT_DATA_UINT64 },
500445c0
PS
528 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
529 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
34dc7c2f
BB
530 { "l2_hits", KSTAT_DATA_UINT64 },
531 { "l2_misses", KSTAT_DATA_UINT64 },
532 { "l2_feeds", KSTAT_DATA_UINT64 },
533 { "l2_rw_clash", KSTAT_DATA_UINT64 },
d164b209
BB
534 { "l2_read_bytes", KSTAT_DATA_UINT64 },
535 { "l2_write_bytes", KSTAT_DATA_UINT64 },
34dc7c2f
BB
536 { "l2_writes_sent", KSTAT_DATA_UINT64 },
537 { "l2_writes_done", KSTAT_DATA_UINT64 },
538 { "l2_writes_error", KSTAT_DATA_UINT64 },
ca0bf58d 539 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
34dc7c2f
BB
540 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
541 { "l2_evict_reading", KSTAT_DATA_UINT64 },
b9541d6b 542 { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
34dc7c2f 543 { "l2_free_on_write", KSTAT_DATA_UINT64 },
ca0bf58d 544 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
34dc7c2f
BB
545 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
546 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
547 { "l2_io_error", KSTAT_DATA_UINT64 },
548 { "l2_size", KSTAT_DATA_UINT64 },
3a17a7a9 549 { "l2_asize", KSTAT_DATA_UINT64 },
34dc7c2f 550 { "l2_hdr_size", KSTAT_DATA_UINT64 },
3a17a7a9
SK
551 { "l2_compress_successes", KSTAT_DATA_UINT64 },
552 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
553 { "l2_compress_failures", KSTAT_DATA_UINT64 },
1834f2d8 554 { "memory_throttle_count", KSTAT_DATA_UINT64 },
1eb5bfa3
GW
555 { "duplicate_buffers", KSTAT_DATA_UINT64 },
556 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
557 { "duplicate_reads", KSTAT_DATA_UINT64 },
7cb67b45
BB
558 { "memory_direct_count", KSTAT_DATA_UINT64 },
559 { "memory_indirect_count", KSTAT_DATA_UINT64 },
1834f2d8
BB
560 { "arc_no_grow", KSTAT_DATA_UINT64 },
561 { "arc_tempreserve", KSTAT_DATA_UINT64 },
562 { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
ab26409d 563 { "arc_prune", KSTAT_DATA_UINT64 },
1834f2d8
BB
564 { "arc_meta_used", KSTAT_DATA_UINT64 },
565 { "arc_meta_limit", KSTAT_DATA_UINT64 },
566 { "arc_meta_max", KSTAT_DATA_UINT64 },
500445c0 567 { "arc_meta_min", KSTAT_DATA_UINT64 }
34dc7c2f
BB
568};
569
570#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
571
572#define ARCSTAT_INCR(stat, val) \
d3cc8b15 573 atomic_add_64(&arc_stats.stat.value.ui64, (val))
34dc7c2f 574
428870ff 575#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
34dc7c2f
BB
576#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
577
578#define ARCSTAT_MAX(stat, val) { \
579 uint64_t m; \
580 while ((val) > (m = arc_stats.stat.value.ui64) && \
581 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
582 continue; \
583}
584
585#define ARCSTAT_MAXSTAT(stat) \
586 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
587
588/*
589 * We define a macro to allow ARC hits/misses to be easily broken down by
590 * two separate conditions, giving a total of four different subtypes for
591 * each of hits and misses (so eight statistics total).
592 */
593#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
594 if (cond1) { \
595 if (cond2) { \
596 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
597 } else { \
598 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
599 } \
600 } else { \
601 if (cond2) { \
602 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
603 } else { \
604 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
605 } \
606 }
607
608kstat_t *arc_ksp;
428870ff 609static arc_state_t *arc_anon;
34dc7c2f
BB
610static arc_state_t *arc_mru;
611static arc_state_t *arc_mru_ghost;
612static arc_state_t *arc_mfu;
613static arc_state_t *arc_mfu_ghost;
614static arc_state_t *arc_l2c_only;
615
616/*
617 * There are several ARC variables that are critical to export as kstats --
618 * but we don't want to have to grovel around in the kstat whenever we wish to
619 * manipulate them. For these variables, we therefore define them to be in
620 * terms of the statistic variable. This assures that we are not introducing
621 * the possibility of inconsistency by having shadow copies of the variables,
622 * while still allowing the code to be readable.
623 */
624#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
625#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
626#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
627#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
628#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
1834f2d8
BB
629#define arc_no_grow ARCSTAT(arcstat_no_grow)
630#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
631#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
23c0a133 632#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
ca0bf58d 633#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
23c0a133
GW
634#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
635#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
34dc7c2f 636
3a17a7a9
SK
637#define L2ARC_IS_VALID_COMPRESS(_c_) \
638 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
639
ab26409d
BB
640static list_t arc_prune_list;
641static kmutex_t arc_prune_mtx;
f6046738 642static taskq_t *arc_prune_taskq;
34dc7c2f 643static arc_buf_t *arc_eviction_list;
34dc7c2f 644static arc_buf_hdr_t arc_eviction_hdr;
428870ff 645
34dc7c2f
BB
646#define GHOST_STATE(state) \
647 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
648 (state) == arc_l2c_only)
649
2a432414
GW
650#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
651#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
652#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
653#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
654#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
655#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
b9541d6b 656
2a432414 657#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
b9541d6b 658#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
2a432414 659#define HDR_L2_READING(hdr) \
b9541d6b
CW
660 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
661 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
2a432414
GW
662#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
663#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
664#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
34dc7c2f 665
b9541d6b
CW
666#define HDR_ISTYPE_METADATA(hdr) \
667 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
668#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
669
670#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
671#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
672
673/* For storing compression mode in b_flags */
674#define HDR_COMPRESS_OFFSET 24
675#define HDR_COMPRESS_NBITS 7
676
677#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \
678 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
679#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
680 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
681
34dc7c2f
BB
682/*
683 * Other sizes
684 */
685
b9541d6b
CW
686#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
687#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
34dc7c2f
BB
688
689/*
690 * Hash table routines
691 */
692
00b46022
BB
693#define HT_LOCK_ALIGN 64
694#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
34dc7c2f
BB
695
696struct ht_lock {
697 kmutex_t ht_lock;
698#ifdef _KERNEL
00b46022 699 unsigned char pad[HT_LOCK_PAD];
34dc7c2f
BB
700#endif
701};
702
b31d8ea7 703#define BUF_LOCKS 8192
34dc7c2f
BB
704typedef struct buf_hash_table {
705 uint64_t ht_mask;
706 arc_buf_hdr_t **ht_table;
707 struct ht_lock ht_locks[BUF_LOCKS];
708} buf_hash_table_t;
709
710static buf_hash_table_t buf_hash_table;
711
712#define BUF_HASH_INDEX(spa, dva, birth) \
713 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
714#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
715#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
428870ff
BB
716#define HDR_LOCK(hdr) \
717 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
34dc7c2f
BB
718
719uint64_t zfs_crc64_table[256];
720
721/*
722 * Level 2 ARC
723 */
724
725#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
3a17a7a9
SK
726#define L2ARC_HEADROOM 2 /* num of writes */
727/*
728 * If we discover during ARC scan any buffers to be compressed, we boost
729 * our headroom for the next scanning cycle by this percentage multiple.
730 */
731#define L2ARC_HEADROOM_BOOST 200
d164b209
BB
732#define L2ARC_FEED_SECS 1 /* caching interval secs */
733#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
34dc7c2f 734
d962d5da
PS
735/*
736 * Used to distinguish headers that are being process by
737 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
738 * address. This can happen when the header is added to the l2arc's list
739 * of buffers to write in the first stage of l2arc_write_buffers(), but
740 * has not yet been written out which happens in the second stage of
741 * l2arc_write_buffers().
742 */
743#define L2ARC_ADDR_UNSET ((uint64_t)(-1))
744
34dc7c2f
BB
745#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
746#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
747
d3cc8b15 748/* L2ARC Performance Tunables */
abd8610c
BB
749unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
750unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
751unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
3a17a7a9 752unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
abd8610c
BB
753unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
754unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
755int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
3a17a7a9 756int l2arc_nocompress = B_FALSE; /* don't compress bufs */
abd8610c 757int l2arc_feed_again = B_TRUE; /* turbo warmup */
c93504f0 758int l2arc_norw = B_FALSE; /* no reads during writes */
34dc7c2f
BB
759
760/*
761 * L2ARC Internals
762 */
34dc7c2f
BB
763static list_t L2ARC_dev_list; /* device list */
764static list_t *l2arc_dev_list; /* device list pointer */
765static kmutex_t l2arc_dev_mtx; /* device list mutex */
766static l2arc_dev_t *l2arc_dev_last; /* last device used */
34dc7c2f
BB
767static list_t L2ARC_free_on_write; /* free after write buf list */
768static list_t *l2arc_free_on_write; /* free after write list ptr */
769static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
770static uint64_t l2arc_ndev; /* number of devices */
771
772typedef struct l2arc_read_callback {
3a17a7a9
SK
773 arc_buf_t *l2rcb_buf; /* read buffer */
774 spa_t *l2rcb_spa; /* spa */
775 blkptr_t l2rcb_bp; /* original blkptr */
5dbd68a3 776 zbookmark_phys_t l2rcb_zb; /* original bookmark */
3a17a7a9
SK
777 int l2rcb_flags; /* original flags */
778 enum zio_compress l2rcb_compress; /* applied compress */
34dc7c2f
BB
779} l2arc_read_callback_t;
780
34dc7c2f
BB
781typedef struct l2arc_data_free {
782 /* protected by l2arc_free_on_write_mtx */
783 void *l2df_data;
784 size_t l2df_size;
785 void (*l2df_func)(void *, size_t);
786 list_node_t l2df_list_node;
787} l2arc_data_free_t;
788
789static kmutex_t l2arc_feed_thr_lock;
790static kcondvar_t l2arc_feed_thr_cv;
791static uint8_t l2arc_thread_exit;
792
2a432414
GW
793static void arc_get_data_buf(arc_buf_t *);
794static void arc_access(arc_buf_hdr_t *, kmutex_t *);
ca0bf58d 795static boolean_t arc_is_overflowing(void);
2a432414 796static void arc_buf_watch(arc_buf_t *);
ca67b33a 797static void arc_tuning_update(void);
2a432414 798
b9541d6b
CW
799static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
800static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
801
2a432414
GW
802static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
803static void l2arc_read_done(zio_t *);
34dc7c2f 804
b9541d6b 805static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
2a432414
GW
806static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
807static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
3a17a7a9 808
34dc7c2f 809static uint64_t
d164b209 810buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
34dc7c2f 811{
34dc7c2f
BB
812 uint8_t *vdva = (uint8_t *)dva;
813 uint64_t crc = -1ULL;
814 int i;
815
816 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
817
818 for (i = 0; i < sizeof (dva_t); i++)
819 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
820
d164b209 821 crc ^= (spa>>8) ^ birth;
34dc7c2f
BB
822
823 return (crc);
824}
825
826#define BUF_EMPTY(buf) \
827 ((buf)->b_dva.dva_word[0] == 0 && \
b9541d6b 828 (buf)->b_dva.dva_word[1] == 0)
34dc7c2f
BB
829
830#define BUF_EQUAL(spa, dva, birth, buf) \
831 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
832 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
833 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
834
428870ff
BB
835static void
836buf_discard_identity(arc_buf_hdr_t *hdr)
837{
838 hdr->b_dva.dva_word[0] = 0;
839 hdr->b_dva.dva_word[1] = 0;
840 hdr->b_birth = 0;
428870ff
BB
841}
842
34dc7c2f 843static arc_buf_hdr_t *
9b67f605 844buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
34dc7c2f 845{
9b67f605
MA
846 const dva_t *dva = BP_IDENTITY(bp);
847 uint64_t birth = BP_PHYSICAL_BIRTH(bp);
34dc7c2f
BB
848 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
849 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
2a432414 850 arc_buf_hdr_t *hdr;
34dc7c2f
BB
851
852 mutex_enter(hash_lock);
2a432414
GW
853 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
854 hdr = hdr->b_hash_next) {
855 if (BUF_EQUAL(spa, dva, birth, hdr)) {
34dc7c2f 856 *lockp = hash_lock;
2a432414 857 return (hdr);
34dc7c2f
BB
858 }
859 }
860 mutex_exit(hash_lock);
861 *lockp = NULL;
862 return (NULL);
863}
864
865/*
866 * Insert an entry into the hash table. If there is already an element
867 * equal to elem in the hash table, then the already existing element
868 * will be returned and the new element will not be inserted.
869 * Otherwise returns NULL.
b9541d6b 870 * If lockp == NULL, the caller is assumed to already hold the hash lock.
34dc7c2f
BB
871 */
872static arc_buf_hdr_t *
2a432414 873buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
34dc7c2f 874{
2a432414 875 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
34dc7c2f 876 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
2a432414 877 arc_buf_hdr_t *fhdr;
34dc7c2f
BB
878 uint32_t i;
879
2a432414
GW
880 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
881 ASSERT(hdr->b_birth != 0);
882 ASSERT(!HDR_IN_HASH_TABLE(hdr));
b9541d6b
CW
883
884 if (lockp != NULL) {
885 *lockp = hash_lock;
886 mutex_enter(hash_lock);
887 } else {
888 ASSERT(MUTEX_HELD(hash_lock));
889 }
890
2a432414
GW
891 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
892 fhdr = fhdr->b_hash_next, i++) {
893 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
894 return (fhdr);
34dc7c2f
BB
895 }
896
2a432414
GW
897 hdr->b_hash_next = buf_hash_table.ht_table[idx];
898 buf_hash_table.ht_table[idx] = hdr;
899 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
34dc7c2f
BB
900
901 /* collect some hash table performance data */
902 if (i > 0) {
903 ARCSTAT_BUMP(arcstat_hash_collisions);
904 if (i == 1)
905 ARCSTAT_BUMP(arcstat_hash_chains);
906
907 ARCSTAT_MAX(arcstat_hash_chain_max, i);
908 }
909
910 ARCSTAT_BUMP(arcstat_hash_elements);
911 ARCSTAT_MAXSTAT(arcstat_hash_elements);
912
913 return (NULL);
914}
915
916static void
2a432414 917buf_hash_remove(arc_buf_hdr_t *hdr)
34dc7c2f 918{
2a432414
GW
919 arc_buf_hdr_t *fhdr, **hdrp;
920 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
34dc7c2f
BB
921
922 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
2a432414 923 ASSERT(HDR_IN_HASH_TABLE(hdr));
34dc7c2f 924
2a432414
GW
925 hdrp = &buf_hash_table.ht_table[idx];
926 while ((fhdr = *hdrp) != hdr) {
927 ASSERT(fhdr != NULL);
928 hdrp = &fhdr->b_hash_next;
34dc7c2f 929 }
2a432414
GW
930 *hdrp = hdr->b_hash_next;
931 hdr->b_hash_next = NULL;
932 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
34dc7c2f
BB
933
934 /* collect some hash table performance data */
935 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
936
937 if (buf_hash_table.ht_table[idx] &&
938 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
939 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
940}
941
942/*
943 * Global data structures and functions for the buf kmem cache.
944 */
b9541d6b
CW
945static kmem_cache_t *hdr_full_cache;
946static kmem_cache_t *hdr_l2only_cache;
34dc7c2f
BB
947static kmem_cache_t *buf_cache;
948
949static void
950buf_fini(void)
951{
952 int i;
953
00b46022 954#if defined(_KERNEL) && defined(HAVE_SPL)
d1d7e268
MK
955 /*
956 * Large allocations which do not require contiguous pages
957 * should be using vmem_free() in the linux kernel\
958 */
00b46022
BB
959 vmem_free(buf_hash_table.ht_table,
960 (buf_hash_table.ht_mask + 1) * sizeof (void *));
961#else
34dc7c2f
BB
962 kmem_free(buf_hash_table.ht_table,
963 (buf_hash_table.ht_mask + 1) * sizeof (void *));
00b46022 964#endif
34dc7c2f
BB
965 for (i = 0; i < BUF_LOCKS; i++)
966 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
b9541d6b
CW
967 kmem_cache_destroy(hdr_full_cache);
968 kmem_cache_destroy(hdr_l2only_cache);
34dc7c2f
BB
969 kmem_cache_destroy(buf_cache);
970}
971
972/*
973 * Constructor callback - called when the cache is empty
974 * and a new buf is requested.
975 */
976/* ARGSUSED */
977static int
b9541d6b
CW
978hdr_full_cons(void *vbuf, void *unused, int kmflag)
979{
980 arc_buf_hdr_t *hdr = vbuf;
981
982 bzero(hdr, HDR_FULL_SIZE);
983 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
984 refcount_create(&hdr->b_l1hdr.b_refcnt);
985 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
986 list_link_init(&hdr->b_l1hdr.b_arc_node);
987 list_link_init(&hdr->b_l2hdr.b_l2node);
ca0bf58d 988 multilist_link_init(&hdr->b_l1hdr.b_arc_node);
b9541d6b
CW
989 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
990
991 return (0);
992}
993
994/* ARGSUSED */
995static int
996hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
34dc7c2f 997{
2a432414
GW
998 arc_buf_hdr_t *hdr = vbuf;
999
b9541d6b
CW
1000 bzero(hdr, HDR_L2ONLY_SIZE);
1001 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
34dc7c2f 1002
34dc7c2f
BB
1003 return (0);
1004}
1005
b128c09f
BB
1006/* ARGSUSED */
1007static int
1008buf_cons(void *vbuf, void *unused, int kmflag)
1009{
1010 arc_buf_t *buf = vbuf;
1011
1012 bzero(buf, sizeof (arc_buf_t));
428870ff 1013 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
d164b209
BB
1014 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1015
b128c09f
BB
1016 return (0);
1017}
1018
34dc7c2f
BB
1019/*
1020 * Destructor callback - called when a cached buf is
1021 * no longer required.
1022 */
1023/* ARGSUSED */
1024static void
b9541d6b 1025hdr_full_dest(void *vbuf, void *unused)
34dc7c2f 1026{
2a432414 1027 arc_buf_hdr_t *hdr = vbuf;
34dc7c2f 1028
2a432414 1029 ASSERT(BUF_EMPTY(hdr));
b9541d6b
CW
1030 cv_destroy(&hdr->b_l1hdr.b_cv);
1031 refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1032 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
ca0bf58d 1033 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
b9541d6b
CW
1034 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1035}
1036
1037/* ARGSUSED */
1038static void
1039hdr_l2only_dest(void *vbuf, void *unused)
1040{
1041 ASSERTV(arc_buf_hdr_t *hdr = vbuf);
1042
1043 ASSERT(BUF_EMPTY(hdr));
1044 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
34dc7c2f
BB
1045}
1046
b128c09f
BB
1047/* ARGSUSED */
1048static void
1049buf_dest(void *vbuf, void *unused)
1050{
1051 arc_buf_t *buf = vbuf;
1052
428870ff 1053 mutex_destroy(&buf->b_evict_lock);
d164b209 1054 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
b128c09f
BB
1055}
1056
8c8af9d8
BB
1057/*
1058 * Reclaim callback -- invoked when memory is low.
1059 */
1060/* ARGSUSED */
1061static void
1062hdr_recl(void *unused)
1063{
1064 dprintf("hdr_recl called\n");
1065 /*
1066 * umem calls the reclaim func when we destroy the buf cache,
1067 * which is after we do arc_fini().
1068 */
1069 if (!arc_dead)
1070 cv_signal(&arc_reclaim_thread_cv);
1071}
1072
34dc7c2f
BB
1073static void
1074buf_init(void)
1075{
1076 uint64_t *ct;
1077 uint64_t hsize = 1ULL << 12;
1078 int i, j;
1079
1080 /*
1081 * The hash table is big enough to fill all of physical memory
49ddb315
MA
1082 * with an average block size of zfs_arc_average_blocksize (default 8K).
1083 * By default, the table will take up
1084 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
34dc7c2f 1085 */
49ddb315 1086 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
34dc7c2f
BB
1087 hsize <<= 1;
1088retry:
1089 buf_hash_table.ht_mask = hsize - 1;
00b46022 1090#if defined(_KERNEL) && defined(HAVE_SPL)
d1d7e268
MK
1091 /*
1092 * Large allocations which do not require contiguous pages
1093 * should be using vmem_alloc() in the linux kernel
1094 */
00b46022
BB
1095 buf_hash_table.ht_table =
1096 vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
1097#else
34dc7c2f
BB
1098 buf_hash_table.ht_table =
1099 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
00b46022 1100#endif
34dc7c2f
BB
1101 if (buf_hash_table.ht_table == NULL) {
1102 ASSERT(hsize > (1ULL << 8));
1103 hsize >>= 1;
1104 goto retry;
1105 }
1106
b9541d6b 1107 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
8c8af9d8 1108 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
b9541d6b 1109 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
8c8af9d8 1110 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
b9541d6b 1111 NULL, NULL, 0);
34dc7c2f 1112 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
b128c09f 1113 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
34dc7c2f
BB
1114
1115 for (i = 0; i < 256; i++)
1116 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1117 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1118
1119 for (i = 0; i < BUF_LOCKS; i++) {
1120 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
40d06e3c 1121 NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
1122 }
1123}
1124
b9541d6b
CW
1125/*
1126 * Transition between the two allocation states for the arc_buf_hdr struct.
1127 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1128 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1129 * version is used when a cache buffer is only in the L2ARC in order to reduce
1130 * memory usage.
1131 */
1132static arc_buf_hdr_t *
1133arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1134{
1135 arc_buf_hdr_t *nhdr;
1136 l2arc_dev_t *dev;
1137
1138 ASSERT(HDR_HAS_L2HDR(hdr));
1139 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1140 (old == hdr_l2only_cache && new == hdr_full_cache));
1141
1142 dev = hdr->b_l2hdr.b_dev;
1143 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1144
1145 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1146 buf_hash_remove(hdr);
1147
1148 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
d962d5da 1149
b9541d6b
CW
1150 if (new == hdr_full_cache) {
1151 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1152 /*
1153 * arc_access and arc_change_state need to be aware that a
1154 * header has just come out of L2ARC, so we set its state to
1155 * l2c_only even though it's about to change.
1156 */
1157 nhdr->b_l1hdr.b_state = arc_l2c_only;
ca0bf58d
PS
1158
1159 /* Verify previous threads set to NULL before freeing */
1160 ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
b9541d6b
CW
1161 } else {
1162 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1163 ASSERT0(hdr->b_l1hdr.b_datacnt);
ca0bf58d
PS
1164
1165 /*
1166 * If we've reached here, We must have been called from
1167 * arc_evict_hdr(), as such we should have already been
1168 * removed from any ghost list we were previously on
1169 * (which protects us from racing with arc_evict_state),
1170 * thus no locking is needed during this check.
1171 */
1172 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1173
b9541d6b 1174 /*
ca0bf58d
PS
1175 * A buffer must not be moved into the arc_l2c_only
1176 * state if it's not finished being written out to the
1177 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
1178 * might try to be accessed, even though it was removed.
b9541d6b 1179 */
ca0bf58d
PS
1180 VERIFY(!HDR_L2_WRITING(hdr));
1181 VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1182
b9541d6b
CW
1183 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1184 }
1185 /*
1186 * The header has been reallocated so we need to re-insert it into any
1187 * lists it was on.
1188 */
1189 (void) buf_hash_insert(nhdr, NULL);
1190
1191 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1192
1193 mutex_enter(&dev->l2ad_mtx);
1194
1195 /*
1196 * We must place the realloc'ed header back into the list at
1197 * the same spot. Otherwise, if it's placed earlier in the list,
1198 * l2arc_write_buffers() could find it during the function's
1199 * write phase, and try to write it out to the l2arc.
1200 */
1201 list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1202 list_remove(&dev->l2ad_buflist, hdr);
1203
1204 mutex_exit(&dev->l2ad_mtx);
1205
d962d5da
PS
1206 /*
1207 * Since we're using the pointer address as the tag when
1208 * incrementing and decrementing the l2ad_alloc refcount, we
1209 * must remove the old pointer (that we're about to destroy) and
1210 * add the new pointer to the refcount. Otherwise we'd remove
1211 * the wrong pointer address when calling arc_hdr_destroy() later.
1212 */
1213
1214 (void) refcount_remove_many(&dev->l2ad_alloc,
1215 hdr->b_l2hdr.b_asize, hdr);
1216
1217 (void) refcount_add_many(&dev->l2ad_alloc,
1218 nhdr->b_l2hdr.b_asize, nhdr);
1219
b9541d6b
CW
1220 buf_discard_identity(hdr);
1221 hdr->b_freeze_cksum = NULL;
1222 kmem_cache_free(old, hdr);
1223
1224 return (nhdr);
1225}
1226
1227
34dc7c2f
BB
1228#define ARC_MINTIME (hz>>4) /* 62 ms */
1229
1230static void
1231arc_cksum_verify(arc_buf_t *buf)
1232{
1233 zio_cksum_t zc;
1234
1235 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1236 return;
1237
b9541d6b
CW
1238 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1239 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1240 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1241 return;
1242 }
1243 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1244 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1245 panic("buffer modified while frozen!");
b9541d6b 1246 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1247}
1248
1249static int
1250arc_cksum_equal(arc_buf_t *buf)
1251{
1252 zio_cksum_t zc;
1253 int equal;
1254
b9541d6b 1255 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1256 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1257 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
b9541d6b 1258 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1259
1260 return (equal);
1261}
1262
1263static void
1264arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1265{
1266 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1267 return;
1268
b9541d6b 1269 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
34dc7c2f 1270 if (buf->b_hdr->b_freeze_cksum != NULL) {
b9541d6b 1271 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1272 return;
1273 }
409dc1a5 1274 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
79c76d5b 1275 KM_SLEEP);
34dc7c2f
BB
1276 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1277 buf->b_hdr->b_freeze_cksum);
b9541d6b 1278 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
498877ba
MA
1279 arc_buf_watch(buf);
1280}
1281
1282#ifndef _KERNEL
1283void
1284arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
1285{
1286 panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
1287}
1288#endif
1289
1290/* ARGSUSED */
1291static void
1292arc_buf_unwatch(arc_buf_t *buf)
1293{
1294#ifndef _KERNEL
1295 if (arc_watch) {
1296 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
1297 PROT_READ | PROT_WRITE));
1298 }
1299#endif
1300}
1301
1302/* ARGSUSED */
1303static void
1304arc_buf_watch(arc_buf_t *buf)
1305{
1306#ifndef _KERNEL
1307 if (arc_watch)
1308 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
1309#endif
34dc7c2f
BB
1310}
1311
b9541d6b
CW
1312static arc_buf_contents_t
1313arc_buf_type(arc_buf_hdr_t *hdr)
1314{
1315 if (HDR_ISTYPE_METADATA(hdr)) {
1316 return (ARC_BUFC_METADATA);
1317 } else {
1318 return (ARC_BUFC_DATA);
1319 }
1320}
1321
1322static uint32_t
1323arc_bufc_to_flags(arc_buf_contents_t type)
1324{
1325 switch (type) {
1326 case ARC_BUFC_DATA:
1327 /* metadata field is 0 if buffer contains normal data */
1328 return (0);
1329 case ARC_BUFC_METADATA:
1330 return (ARC_FLAG_BUFC_METADATA);
1331 default:
1332 break;
1333 }
1334 panic("undefined ARC buffer type!");
1335 return ((uint32_t)-1);
1336}
1337
34dc7c2f
BB
1338void
1339arc_buf_thaw(arc_buf_t *buf)
1340{
1341 if (zfs_flags & ZFS_DEBUG_MODIFY) {
b9541d6b 1342 if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
34dc7c2f 1343 panic("modifying non-anon buffer!");
b9541d6b 1344 if (HDR_IO_IN_PROGRESS(buf->b_hdr))
34dc7c2f
BB
1345 panic("modifying buffer while i/o in progress!");
1346 arc_cksum_verify(buf);
1347 }
1348
b9541d6b 1349 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1350 if (buf->b_hdr->b_freeze_cksum != NULL) {
1351 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1352 buf->b_hdr->b_freeze_cksum = NULL;
1353 }
428870ff 1354
b9541d6b 1355 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
498877ba
MA
1356
1357 arc_buf_unwatch(buf);
34dc7c2f
BB
1358}
1359
1360void
1361arc_buf_freeze(arc_buf_t *buf)
1362{
428870ff
BB
1363 kmutex_t *hash_lock;
1364
34dc7c2f
BB
1365 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1366 return;
1367
428870ff
BB
1368 hash_lock = HDR_LOCK(buf->b_hdr);
1369 mutex_enter(hash_lock);
1370
34dc7c2f 1371 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
b9541d6b 1372 buf->b_hdr->b_l1hdr.b_state == arc_anon);
34dc7c2f 1373 arc_cksum_compute(buf, B_FALSE);
428870ff 1374 mutex_exit(hash_lock);
498877ba 1375
34dc7c2f
BB
1376}
1377
1378static void
2a432414 1379add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
34dc7c2f 1380{
b9541d6b
CW
1381 arc_state_t *state;
1382
1383 ASSERT(HDR_HAS_L1HDR(hdr));
34dc7c2f
BB
1384 ASSERT(MUTEX_HELD(hash_lock));
1385
b9541d6b
CW
1386 state = hdr->b_l1hdr.b_state;
1387
1388 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1389 (state != arc_anon)) {
1390 /* We don't use the L2-only state list. */
1391 if (state != arc_l2c_only) {
ca0bf58d 1392 arc_buf_contents_t type = arc_buf_type(hdr);
b9541d6b 1393 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
ca0bf58d
PS
1394 multilist_t *list = &state->arcs_list[type];
1395 uint64_t *size = &state->arcs_lsize[type];
1396
1397 multilist_remove(list, hdr);
b9541d6b 1398
b9541d6b
CW
1399 if (GHOST_STATE(state)) {
1400 ASSERT0(hdr->b_l1hdr.b_datacnt);
1401 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1402 delta = hdr->b_size;
1403 }
1404 ASSERT(delta > 0);
1405 ASSERT3U(*size, >=, delta);
1406 atomic_add_64(size, -delta);
34dc7c2f 1407 }
b128c09f 1408 /* remove the prefetch flag if we get a reference */
b9541d6b 1409 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
34dc7c2f
BB
1410 }
1411}
1412
1413static int
2a432414 1414remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
34dc7c2f
BB
1415{
1416 int cnt;
b9541d6b 1417 arc_state_t *state = hdr->b_l1hdr.b_state;
34dc7c2f 1418
b9541d6b 1419 ASSERT(HDR_HAS_L1HDR(hdr));
34dc7c2f
BB
1420 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1421 ASSERT(!GHOST_STATE(state));
1422
b9541d6b
CW
1423 /*
1424 * arc_l2c_only counts as a ghost state so we don't need to explicitly
1425 * check to prevent usage of the arc_l2c_only list.
1426 */
1427 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
34dc7c2f 1428 (state != arc_anon)) {
ca0bf58d
PS
1429 arc_buf_contents_t type = arc_buf_type(hdr);
1430 multilist_t *list = &state->arcs_list[type];
1431 uint64_t *size = &state->arcs_lsize[type];
1432
1433 multilist_insert(list, hdr);
34dc7c2f 1434
b9541d6b
CW
1435 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1436 atomic_add_64(size, hdr->b_size *
1437 hdr->b_l1hdr.b_datacnt);
34dc7c2f
BB
1438 }
1439 return (cnt);
1440}
1441
e0b0ca98
BB
1442/*
1443 * Returns detailed information about a specific arc buffer. When the
1444 * state_index argument is set the function will calculate the arc header
1445 * list position for its arc state. Since this requires a linear traversal
1446 * callers are strongly encourage not to do this. However, it can be helpful
1447 * for targeted analysis so the functionality is provided.
1448 */
1449void
1450arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
1451{
1452 arc_buf_hdr_t *hdr = ab->b_hdr;
b9541d6b
CW
1453 l1arc_buf_hdr_t *l1hdr = NULL;
1454 l2arc_buf_hdr_t *l2hdr = NULL;
1455 arc_state_t *state = NULL;
1456
1457 if (HDR_HAS_L1HDR(hdr)) {
1458 l1hdr = &hdr->b_l1hdr;
1459 state = l1hdr->b_state;
1460 }
1461 if (HDR_HAS_L2HDR(hdr))
1462 l2hdr = &hdr->b_l2hdr;
e0b0ca98 1463
d1d7e268 1464 memset(abi, 0, sizeof (arc_buf_info_t));
e0b0ca98 1465 abi->abi_flags = hdr->b_flags;
b9541d6b
CW
1466
1467 if (l1hdr) {
1468 abi->abi_datacnt = l1hdr->b_datacnt;
1469 abi->abi_access = l1hdr->b_arc_access;
1470 abi->abi_mru_hits = l1hdr->b_mru_hits;
1471 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
1472 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
1473 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
1474 abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
1475 }
1476
1477 if (l2hdr) {
1478 abi->abi_l2arc_dattr = l2hdr->b_daddr;
1479 abi->abi_l2arc_asize = l2hdr->b_asize;
1480 abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr);
1481 abi->abi_l2arc_hits = l2hdr->b_hits;
1482 }
1483
e0b0ca98 1484 abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
b9541d6b 1485 abi->abi_state_contents = arc_buf_type(hdr);
e0b0ca98 1486 abi->abi_size = hdr->b_size;
e0b0ca98
BB
1487}
1488
34dc7c2f 1489/*
ca0bf58d 1490 * Move the supplied buffer to the indicated state. The hash lock
34dc7c2f
BB
1491 * for the buffer must be held by the caller.
1492 */
1493static void
2a432414
GW
1494arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1495 kmutex_t *hash_lock)
34dc7c2f 1496{
b9541d6b
CW
1497 arc_state_t *old_state;
1498 int64_t refcnt;
1499 uint32_t datacnt;
34dc7c2f 1500 uint64_t from_delta, to_delta;
b9541d6b
CW
1501 arc_buf_contents_t buftype = arc_buf_type(hdr);
1502
1503 /*
1504 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1505 * in arc_read() when bringing a buffer out of the L2ARC. However, the
1506 * L1 hdr doesn't always exist when we change state to arc_anon before
1507 * destroying a header, in which case reallocating to add the L1 hdr is
1508 * pointless.
1509 */
1510 if (HDR_HAS_L1HDR(hdr)) {
1511 old_state = hdr->b_l1hdr.b_state;
1512 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1513 datacnt = hdr->b_l1hdr.b_datacnt;
1514 } else {
1515 old_state = arc_l2c_only;
1516 refcnt = 0;
1517 datacnt = 0;
1518 }
34dc7c2f
BB
1519
1520 ASSERT(MUTEX_HELD(hash_lock));
e8b96c60 1521 ASSERT3P(new_state, !=, old_state);
b9541d6b
CW
1522 ASSERT(refcnt == 0 || datacnt > 0);
1523 ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1524 ASSERT(old_state != arc_anon || datacnt <= 1);
34dc7c2f 1525
b9541d6b 1526 from_delta = to_delta = datacnt * hdr->b_size;
34dc7c2f
BB
1527
1528 /*
1529 * If this buffer is evictable, transfer it from the
1530 * old state list to the new state list.
1531 */
1532 if (refcnt == 0) {
b9541d6b 1533 if (old_state != arc_anon && old_state != arc_l2c_only) {
b9541d6b 1534 uint64_t *size = &old_state->arcs_lsize[buftype];
34dc7c2f 1535
b9541d6b 1536 ASSERT(HDR_HAS_L1HDR(hdr));
ca0bf58d 1537 multilist_remove(&old_state->arcs_list[buftype], hdr);
34dc7c2f
BB
1538
1539 /*
1540 * If prefetching out of the ghost cache,
428870ff 1541 * we will have a non-zero datacnt.
34dc7c2f 1542 */
b9541d6b 1543 if (GHOST_STATE(old_state) && datacnt == 0) {
34dc7c2f 1544 /* ghost elements have a ghost size */
b9541d6b 1545 ASSERT(hdr->b_l1hdr.b_buf == NULL);
2a432414 1546 from_delta = hdr->b_size;
34dc7c2f
BB
1547 }
1548 ASSERT3U(*size, >=, from_delta);
1549 atomic_add_64(size, -from_delta);
34dc7c2f 1550 }
b9541d6b 1551 if (new_state != arc_anon && new_state != arc_l2c_only) {
b9541d6b 1552 uint64_t *size = &new_state->arcs_lsize[buftype];
34dc7c2f 1553
b9541d6b
CW
1554 /*
1555 * An L1 header always exists here, since if we're
1556 * moving to some L1-cached state (i.e. not l2c_only or
1557 * anonymous), we realloc the header to add an L1hdr
1558 * beforehand.
1559 */
1560 ASSERT(HDR_HAS_L1HDR(hdr));
ca0bf58d 1561 multilist_insert(&new_state->arcs_list[buftype], hdr);
34dc7c2f
BB
1562
1563 /* ghost elements have a ghost size */
1564 if (GHOST_STATE(new_state)) {
b9541d6b
CW
1565 ASSERT0(datacnt);
1566 ASSERT(hdr->b_l1hdr.b_buf == NULL);
2a432414 1567 to_delta = hdr->b_size;
34dc7c2f
BB
1568 }
1569 atomic_add_64(size, to_delta);
34dc7c2f
BB
1570 }
1571 }
1572
2a432414
GW
1573 ASSERT(!BUF_EMPTY(hdr));
1574 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1575 buf_hash_remove(hdr);
34dc7c2f 1576
b9541d6b 1577 /* adjust state sizes (ignore arc_l2c_only) */
36da08ef
PS
1578
1579 if (to_delta && new_state != arc_l2c_only) {
1580 ASSERT(HDR_HAS_L1HDR(hdr));
1581 if (GHOST_STATE(new_state)) {
1582 ASSERT0(datacnt);
1583
1584 /*
1585 * We moving a header to a ghost state, we first
1586 * remove all arc buffers. Thus, we'll have a
1587 * datacnt of zero, and no arc buffer to use for
1588 * the reference. As a result, we use the arc
1589 * header pointer for the reference.
1590 */
1591 (void) refcount_add_many(&new_state->arcs_size,
1592 hdr->b_size, hdr);
1593 } else {
1594 arc_buf_t *buf;
1595 ASSERT3U(datacnt, !=, 0);
1596
1597 /*
1598 * Each individual buffer holds a unique reference,
1599 * thus we must remove each of these references one
1600 * at a time.
1601 */
1602 for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
1603 buf = buf->b_next) {
1604 (void) refcount_add_many(&new_state->arcs_size,
1605 hdr->b_size, buf);
1606 }
1607 }
1608 }
1609
b9541d6b 1610 if (from_delta && old_state != arc_l2c_only) {
36da08ef
PS
1611 ASSERT(HDR_HAS_L1HDR(hdr));
1612 if (GHOST_STATE(old_state)) {
1613 /*
1614 * When moving a header off of a ghost state,
1615 * there's the possibility for datacnt to be
1616 * non-zero. This is because we first add the
1617 * arc buffer to the header prior to changing
1618 * the header's state. Since we used the header
1619 * for the reference when putting the header on
1620 * the ghost state, we must balance that and use
1621 * the header when removing off the ghost state
1622 * (even though datacnt is non zero).
1623 */
1624
1625 IMPLY(datacnt == 0, new_state == arc_anon ||
1626 new_state == arc_l2c_only);
1627
1628 (void) refcount_remove_many(&old_state->arcs_size,
1629 hdr->b_size, hdr);
1630 } else {
1631 arc_buf_t *buf;
1632 ASSERT3U(datacnt, !=, 0);
1633
1634 /*
1635 * Each individual buffer holds a unique reference,
1636 * thus we must remove each of these references one
1637 * at a time.
1638 */
1639 for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
1640 buf = buf->b_next) {
1641 (void) refcount_remove_many(
1642 &old_state->arcs_size, hdr->b_size, buf);
1643 }
1644 }
34dc7c2f 1645 }
36da08ef 1646
b9541d6b
CW
1647 if (HDR_HAS_L1HDR(hdr))
1648 hdr->b_l1hdr.b_state = new_state;
34dc7c2f 1649
b9541d6b
CW
1650 /*
1651 * L2 headers should never be on the L2 state list since they don't
1652 * have L1 headers allocated.
1653 */
ca0bf58d
PS
1654 ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1655 multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
34dc7c2f
BB
1656}
1657
1658void
d164b209 1659arc_space_consume(uint64_t space, arc_space_type_t type)
34dc7c2f 1660{
d164b209
BB
1661 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1662
1663 switch (type) {
e75c13c3
BB
1664 default:
1665 break;
d164b209
BB
1666 case ARC_SPACE_DATA:
1667 ARCSTAT_INCR(arcstat_data_size, space);
1668 break;
cc7f677c 1669 case ARC_SPACE_META:
500445c0 1670 ARCSTAT_INCR(arcstat_metadata_size, space);
cc7f677c 1671 break;
d164b209
BB
1672 case ARC_SPACE_OTHER:
1673 ARCSTAT_INCR(arcstat_other_size, space);
1674 break;
1675 case ARC_SPACE_HDRS:
1676 ARCSTAT_INCR(arcstat_hdr_size, space);
1677 break;
1678 case ARC_SPACE_L2HDRS:
1679 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1680 break;
1681 }
1682
500445c0 1683 if (type != ARC_SPACE_DATA)
cc7f677c
PS
1684 ARCSTAT_INCR(arcstat_meta_used, space);
1685
34dc7c2f
BB
1686 atomic_add_64(&arc_size, space);
1687}
1688
1689void
d164b209 1690arc_space_return(uint64_t space, arc_space_type_t type)
34dc7c2f 1691{
d164b209
BB
1692 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1693
1694 switch (type) {
e75c13c3
BB
1695 default:
1696 break;
d164b209
BB
1697 case ARC_SPACE_DATA:
1698 ARCSTAT_INCR(arcstat_data_size, -space);
1699 break;
cc7f677c 1700 case ARC_SPACE_META:
500445c0 1701 ARCSTAT_INCR(arcstat_metadata_size, -space);
cc7f677c 1702 break;
d164b209
BB
1703 case ARC_SPACE_OTHER:
1704 ARCSTAT_INCR(arcstat_other_size, -space);
1705 break;
1706 case ARC_SPACE_HDRS:
1707 ARCSTAT_INCR(arcstat_hdr_size, -space);
1708 break;
1709 case ARC_SPACE_L2HDRS:
1710 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1711 break;
1712 }
1713
cc7f677c
PS
1714 if (type != ARC_SPACE_DATA) {
1715 ASSERT(arc_meta_used >= space);
500445c0
PS
1716 if (arc_meta_max < arc_meta_used)
1717 arc_meta_max = arc_meta_used;
cc7f677c
PS
1718 ARCSTAT_INCR(arcstat_meta_used, -space);
1719 }
1720
34dc7c2f
BB
1721 ASSERT(arc_size >= space);
1722 atomic_add_64(&arc_size, -space);
1723}
1724
34dc7c2f 1725arc_buf_t *
5f6d0b6f 1726arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
34dc7c2f
BB
1727{
1728 arc_buf_hdr_t *hdr;
1729 arc_buf_t *buf;
1730
f1512ee6 1731 VERIFY3U(size, <=, spa_maxblocksize(spa));
b9541d6b 1732 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
34dc7c2f 1733 ASSERT(BUF_EMPTY(hdr));
b9541d6b 1734 ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
34dc7c2f 1735 hdr->b_size = size;
3541dc6d 1736 hdr->b_spa = spa_load_guid(spa);
b9541d6b
CW
1737 hdr->b_l1hdr.b_mru_hits = 0;
1738 hdr->b_l1hdr.b_mru_ghost_hits = 0;
1739 hdr->b_l1hdr.b_mfu_hits = 0;
1740 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
1741 hdr->b_l1hdr.b_l2_hits = 0;
1742
34dc7c2f
BB
1743 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1744 buf->b_hdr = hdr;
1745 buf->b_data = NULL;
1746 buf->b_efunc = NULL;
1747 buf->b_private = NULL;
1748 buf->b_next = NULL;
b9541d6b
CW
1749
1750 hdr->b_flags = arc_bufc_to_flags(type);
1751 hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1752
1753 hdr->b_l1hdr.b_buf = buf;
1754 hdr->b_l1hdr.b_state = arc_anon;
1755 hdr->b_l1hdr.b_arc_access = 0;
1756 hdr->b_l1hdr.b_datacnt = 1;
ca0bf58d 1757 hdr->b_l1hdr.b_tmp_cdata = NULL;
b9541d6b 1758
34dc7c2f 1759 arc_get_data_buf(buf);
b9541d6b
CW
1760
1761 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1762 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
34dc7c2f
BB
1763
1764 return (buf);
1765}
1766
9babb374
BB
1767static char *arc_onloan_tag = "onloan";
1768
1769/*
1770 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1771 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1772 * buffers must be returned to the arc before they can be used by the DMU or
1773 * freed.
1774 */
1775arc_buf_t *
5f6d0b6f 1776arc_loan_buf(spa_t *spa, uint64_t size)
9babb374
BB
1777{
1778 arc_buf_t *buf;
1779
1780 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1781
1782 atomic_add_64(&arc_loaned_bytes, size);
1783 return (buf);
1784}
1785
1786/*
1787 * Return a loaned arc buffer to the arc.
1788 */
1789void
1790arc_return_buf(arc_buf_t *buf, void *tag)
1791{
1792 arc_buf_hdr_t *hdr = buf->b_hdr;
1793
9babb374 1794 ASSERT(buf->b_data != NULL);
b9541d6b
CW
1795 ASSERT(HDR_HAS_L1HDR(hdr));
1796 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1797 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
9babb374
BB
1798
1799 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1800}
1801
428870ff
BB
1802/* Detach an arc_buf from a dbuf (tag) */
1803void
1804arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1805{
b9541d6b 1806 arc_buf_hdr_t *hdr = buf->b_hdr;
428870ff
BB
1807
1808 ASSERT(buf->b_data != NULL);
b9541d6b
CW
1809 ASSERT(HDR_HAS_L1HDR(hdr));
1810 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1811 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
428870ff
BB
1812 buf->b_efunc = NULL;
1813 buf->b_private = NULL;
1814
1815 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1816}
1817
34dc7c2f
BB
1818static arc_buf_t *
1819arc_buf_clone(arc_buf_t *from)
1820{
1821 arc_buf_t *buf;
1822 arc_buf_hdr_t *hdr = from->b_hdr;
1823 uint64_t size = hdr->b_size;
1824
b9541d6b
CW
1825 ASSERT(HDR_HAS_L1HDR(hdr));
1826 ASSERT(hdr->b_l1hdr.b_state != arc_anon);
428870ff 1827
34dc7c2f
BB
1828 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1829 buf->b_hdr = hdr;
1830 buf->b_data = NULL;
1831 buf->b_efunc = NULL;
1832 buf->b_private = NULL;
b9541d6b
CW
1833 buf->b_next = hdr->b_l1hdr.b_buf;
1834 hdr->b_l1hdr.b_buf = buf;
34dc7c2f
BB
1835 arc_get_data_buf(buf);
1836 bcopy(from->b_data, buf->b_data, size);
1eb5bfa3
GW
1837
1838 /*
1839 * This buffer already exists in the arc so create a duplicate
1840 * copy for the caller. If the buffer is associated with user data
1841 * then track the size and number of duplicates. These stats will be
1842 * updated as duplicate buffers are created and destroyed.
1843 */
b9541d6b 1844 if (HDR_ISTYPE_DATA(hdr)) {
1eb5bfa3
GW
1845 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1846 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1847 }
b9541d6b 1848 hdr->b_l1hdr.b_datacnt += 1;
34dc7c2f
BB
1849 return (buf);
1850}
1851
1852void
1853arc_buf_add_ref(arc_buf_t *buf, void* tag)
1854{
1855 arc_buf_hdr_t *hdr;
1856 kmutex_t *hash_lock;
1857
1858 /*
b128c09f
BB
1859 * Check to see if this buffer is evicted. Callers
1860 * must verify b_data != NULL to know if the add_ref
1861 * was successful.
34dc7c2f 1862 */
428870ff 1863 mutex_enter(&buf->b_evict_lock);
b128c09f 1864 if (buf->b_data == NULL) {
428870ff 1865 mutex_exit(&buf->b_evict_lock);
34dc7c2f
BB
1866 return;
1867 }
428870ff 1868 hash_lock = HDR_LOCK(buf->b_hdr);
34dc7c2f 1869 mutex_enter(hash_lock);
428870ff 1870 hdr = buf->b_hdr;
b9541d6b 1871 ASSERT(HDR_HAS_L1HDR(hdr));
428870ff
BB
1872 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1873 mutex_exit(&buf->b_evict_lock);
34dc7c2f 1874
b9541d6b
CW
1875 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
1876 hdr->b_l1hdr.b_state == arc_mfu);
1877
34dc7c2f 1878 add_reference(hdr, hash_lock, tag);
d164b209 1879 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
34dc7c2f
BB
1880 arc_access(hdr, hash_lock);
1881 mutex_exit(hash_lock);
1882 ARCSTAT_BUMP(arcstat_hits);
b9541d6b
CW
1883 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
1884 demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
34dc7c2f
BB
1885 data, metadata, hits);
1886}
1887
ca0bf58d
PS
1888static void
1889arc_buf_free_on_write(void *data, size_t size,
1890 void (*free_func)(void *, size_t))
1891{
1892 l2arc_data_free_t *df;
1893
1894 df = kmem_alloc(sizeof (*df), KM_SLEEP);
1895 df->l2df_data = data;
1896 df->l2df_size = size;
1897 df->l2df_func = free_func;
1898 mutex_enter(&l2arc_free_on_write_mtx);
1899 list_insert_head(l2arc_free_on_write, df);
1900 mutex_exit(&l2arc_free_on_write_mtx);
1901}
1902
34dc7c2f
BB
1903/*
1904 * Free the arc data buffer. If it is an l2arc write in progress,
1905 * the buffer is placed on l2arc_free_on_write to be freed later.
1906 */
1907static void
498877ba 1908arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
34dc7c2f 1909{
498877ba
MA
1910 arc_buf_hdr_t *hdr = buf->b_hdr;
1911
34dc7c2f 1912 if (HDR_L2_WRITING(hdr)) {
ca0bf58d 1913 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
34dc7c2f
BB
1914 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1915 } else {
498877ba 1916 free_func(buf->b_data, hdr->b_size);
34dc7c2f
BB
1917 }
1918}
1919
ca0bf58d
PS
1920static void
1921arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1922{
1923 ASSERT(HDR_HAS_L2HDR(hdr));
1924 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
1925
1926 /*
1927 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
1928 * that doesn't exist, the header is in the arc_l2c_only state,
1929 * and there isn't anything to free (it's already been freed).
1930 */
1931 if (!HDR_HAS_L1HDR(hdr))
1932 return;
1933
1934 /*
1935 * The header isn't being written to the l2arc device, thus it
1936 * shouldn't have a b_tmp_cdata to free.
1937 */
1938 if (!HDR_L2_WRITING(hdr)) {
1939 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1940 return;
1941 }
1942
1943 /*
1944 * The header does not have compression enabled. This can be due
1945 * to the buffer not being compressible, or because we're
1946 * freeing the buffer before the second phase of
1947 * l2arc_write_buffer() has started (which does the compression
1948 * step). In either case, b_tmp_cdata does not point to a
1949 * separately compressed buffer, so there's nothing to free (it
1950 * points to the same buffer as the arc_buf_t's b_data field).
1951 */
1952 if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
1953 hdr->b_l1hdr.b_tmp_cdata = NULL;
1954 return;
1955 }
1956
1957 /*
1958 * There's nothing to free since the buffer was all zero's and
1959 * compressed to a zero length buffer.
1960 */
1961 if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
1962 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1963 return;
1964 }
1965
1966 ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
1967
1968 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
1969 hdr->b_size, zio_data_buf_free);
1970
1971 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1972 hdr->b_l1hdr.b_tmp_cdata = NULL;
1973}
1974
bd089c54
MA
1975/*
1976 * Free up buf->b_data and if 'remove' is set, then pull the
1977 * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
1978 */
34dc7c2f 1979static void
ca0bf58d 1980arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
34dc7c2f
BB
1981{
1982 arc_buf_t **bufp;
1983
1984 /* free up data associated with the buf */
b9541d6b
CW
1985 if (buf->b_data != NULL) {
1986 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
34dc7c2f 1987 uint64_t size = buf->b_hdr->b_size;
b9541d6b 1988 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
34dc7c2f
BB
1989
1990 arc_cksum_verify(buf);
498877ba 1991 arc_buf_unwatch(buf);
428870ff 1992
ca0bf58d
PS
1993 if (type == ARC_BUFC_METADATA) {
1994 arc_buf_data_free(buf, zio_buf_free);
1995 arc_space_return(size, ARC_SPACE_META);
1996 } else {
1997 ASSERT(type == ARC_BUFC_DATA);
1998 arc_buf_data_free(buf, zio_data_buf_free);
1999 arc_space_return(size, ARC_SPACE_DATA);
34dc7c2f 2000 }
ca0bf58d
PS
2001
2002 /* protected by hash lock, if in the hash table */
2003 if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
34dc7c2f
BB
2004 uint64_t *cnt = &state->arcs_lsize[type];
2005
b9541d6b
CW
2006 ASSERT(refcount_is_zero(
2007 &buf->b_hdr->b_l1hdr.b_refcnt));
2008 ASSERT(state != arc_anon && state != arc_l2c_only);
34dc7c2f
BB
2009
2010 ASSERT3U(*cnt, >=, size);
2011 atomic_add_64(cnt, -size);
2012 }
36da08ef
PS
2013
2014 (void) refcount_remove_many(&state->arcs_size, size, buf);
34dc7c2f 2015 buf->b_data = NULL;
1eb5bfa3
GW
2016
2017 /*
2018 * If we're destroying a duplicate buffer make sure
2019 * that the appropriate statistics are updated.
2020 */
b9541d6b
CW
2021 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2022 HDR_ISTYPE_DATA(buf->b_hdr)) {
1eb5bfa3
GW
2023 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2024 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2025 }
b9541d6b
CW
2026 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2027 buf->b_hdr->b_l1hdr.b_datacnt -= 1;
34dc7c2f
BB
2028 }
2029
2030 /* only remove the buf if requested */
bd089c54 2031 if (!remove)
34dc7c2f
BB
2032 return;
2033
2034 /* remove the buf from the hdr list */
b9541d6b
CW
2035 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2036 bufp = &(*bufp)->b_next)
34dc7c2f
BB
2037 continue;
2038 *bufp = buf->b_next;
428870ff 2039 buf->b_next = NULL;
34dc7c2f
BB
2040
2041 ASSERT(buf->b_efunc == NULL);
2042
2043 /* clean up the buf */
2044 buf->b_hdr = NULL;
2045 kmem_cache_free(buf_cache, buf);
2046}
2047
d962d5da
PS
2048static void
2049arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2050{
2051 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2052 l2arc_dev_t *dev = l2hdr->b_dev;
2053
2054 ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2055 ASSERT(HDR_HAS_L2HDR(hdr));
2056
2057 list_remove(&dev->l2ad_buflist, hdr);
2058
d962d5da
PS
2059 /*
2060 * We don't want to leak the b_tmp_cdata buffer that was
2061 * allocated in l2arc_write_buffers()
2062 */
2063 arc_buf_l2_cdata_free(hdr);
2064
2065 /*
2066 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2067 * this header is being processed by l2arc_write_buffers() (i.e.
2068 * it's in the first stage of l2arc_write_buffers()).
2069 * Re-affirming that truth here, just to serve as a reminder. If
2070 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2071 * may not have its HDR_L2_WRITING flag set. (the write may have
2072 * completed, in which case HDR_L2_WRITING will be false and the
2073 * b_daddr field will point to the address of the buffer on disk).
2074 */
2075 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2076
2077 /*
2078 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2079 * l2arc_write_buffers(). Since we've just removed this header
2080 * from the l2arc buffer list, this header will never reach the
2081 * second stage of l2arc_write_buffers(), which increments the
2082 * accounting stats for this header. Thus, we must be careful
2083 * not to decrement them for this header either.
2084 */
2085 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2086 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2087 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2088
2089 vdev_space_update(dev->l2ad_vdev,
2090 -l2hdr->b_asize, 0, 0);
2091
2092 (void) refcount_remove_many(&dev->l2ad_alloc,
2093 l2hdr->b_asize, hdr);
2094 }
2095
2096 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2097}
2098
34dc7c2f
BB
2099static void
2100arc_hdr_destroy(arc_buf_hdr_t *hdr)
2101{
b9541d6b
CW
2102 if (HDR_HAS_L1HDR(hdr)) {
2103 ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2104 hdr->b_l1hdr.b_datacnt > 0);
2105 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2106 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2107 }
34dc7c2f 2108 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
b9541d6b
CW
2109 ASSERT(!HDR_IN_HASH_TABLE(hdr));
2110
2111 if (HDR_HAS_L2HDR(hdr)) {
d962d5da
PS
2112 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2113 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
428870ff 2114
d962d5da
PS
2115 if (!buflist_held)
2116 mutex_enter(&dev->l2ad_mtx);
b9541d6b 2117
ca0bf58d 2118 /*
d962d5da
PS
2119 * Even though we checked this conditional above, we
2120 * need to check this again now that we have the
2121 * l2ad_mtx. This is because we could be racing with
2122 * another thread calling l2arc_evict() which might have
2123 * destroyed this header's L2 portion as we were waiting
2124 * to acquire the l2ad_mtx. If that happens, we don't
2125 * want to re-destroy the header's L2 portion.
ca0bf58d 2126 */
d962d5da
PS
2127 if (HDR_HAS_L2HDR(hdr))
2128 arc_hdr_l2hdr_destroy(hdr);
428870ff
BB
2129
2130 if (!buflist_held)
d962d5da 2131 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
2132 }
2133
b9541d6b 2134 if (!BUF_EMPTY(hdr))
428870ff 2135 buf_discard_identity(hdr);
b9541d6b 2136
34dc7c2f
BB
2137 if (hdr->b_freeze_cksum != NULL) {
2138 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2139 hdr->b_freeze_cksum = NULL;
2140 }
2141
b9541d6b
CW
2142 if (HDR_HAS_L1HDR(hdr)) {
2143 while (hdr->b_l1hdr.b_buf) {
2144 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2145
2146 if (buf->b_efunc != NULL) {
ca0bf58d 2147 mutex_enter(&arc_user_evicts_lock);
b9541d6b
CW
2148 mutex_enter(&buf->b_evict_lock);
2149 ASSERT(buf->b_hdr != NULL);
ca0bf58d 2150 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
b9541d6b
CW
2151 hdr->b_l1hdr.b_buf = buf->b_next;
2152 buf->b_hdr = &arc_eviction_hdr;
2153 buf->b_next = arc_eviction_list;
2154 arc_eviction_list = buf;
2155 mutex_exit(&buf->b_evict_lock);
ca0bf58d
PS
2156 cv_signal(&arc_user_evicts_cv);
2157 mutex_exit(&arc_user_evicts_lock);
b9541d6b 2158 } else {
ca0bf58d 2159 arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
b9541d6b
CW
2160 }
2161 }
2162 }
2163
34dc7c2f 2164 ASSERT3P(hdr->b_hash_next, ==, NULL);
b9541d6b 2165 if (HDR_HAS_L1HDR(hdr)) {
ca0bf58d 2166 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
b9541d6b
CW
2167 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2168 kmem_cache_free(hdr_full_cache, hdr);
2169 } else {
2170 kmem_cache_free(hdr_l2only_cache, hdr);
2171 }
34dc7c2f
BB
2172}
2173
2174void
2175arc_buf_free(arc_buf_t *buf, void *tag)
2176{
2177 arc_buf_hdr_t *hdr = buf->b_hdr;
b9541d6b 2178 int hashed = hdr->b_l1hdr.b_state != arc_anon;
34dc7c2f
BB
2179
2180 ASSERT(buf->b_efunc == NULL);
2181 ASSERT(buf->b_data != NULL);
2182
2183 if (hashed) {
2184 kmutex_t *hash_lock = HDR_LOCK(hdr);
2185
2186 mutex_enter(hash_lock);
428870ff
BB
2187 hdr = buf->b_hdr;
2188 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2189
34dc7c2f 2190 (void) remove_reference(hdr, hash_lock, tag);
b9541d6b 2191 if (hdr->b_l1hdr.b_datacnt > 1) {
ca0bf58d 2192 arc_buf_destroy(buf, TRUE);
428870ff 2193 } else {
b9541d6b 2194 ASSERT(buf == hdr->b_l1hdr.b_buf);
428870ff 2195 ASSERT(buf->b_efunc == NULL);
2a432414 2196 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
428870ff 2197 }
34dc7c2f
BB
2198 mutex_exit(hash_lock);
2199 } else if (HDR_IO_IN_PROGRESS(hdr)) {
2200 int destroy_hdr;
2201 /*
2202 * We are in the middle of an async write. Don't destroy
2203 * this buffer unless the write completes before we finish
2204 * decrementing the reference count.
2205 */
ca0bf58d 2206 mutex_enter(&arc_user_evicts_lock);
34dc7c2f 2207 (void) remove_reference(hdr, NULL, tag);
b9541d6b 2208 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
34dc7c2f 2209 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
ca0bf58d 2210 mutex_exit(&arc_user_evicts_lock);
34dc7c2f
BB
2211 if (destroy_hdr)
2212 arc_hdr_destroy(hdr);
2213 } else {
428870ff 2214 if (remove_reference(hdr, NULL, tag) > 0)
ca0bf58d 2215 arc_buf_destroy(buf, TRUE);
428870ff 2216 else
34dc7c2f 2217 arc_hdr_destroy(hdr);
34dc7c2f
BB
2218 }
2219}
2220
13fe0198 2221boolean_t
34dc7c2f
BB
2222arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2223{
2224 arc_buf_hdr_t *hdr = buf->b_hdr;
b4f7f105 2225 kmutex_t *hash_lock = NULL;
13fe0198 2226 boolean_t no_callback = (buf->b_efunc == NULL);
34dc7c2f 2227
b9541d6b
CW
2228 if (hdr->b_l1hdr.b_state == arc_anon) {
2229 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
34dc7c2f
BB
2230 arc_buf_free(buf, tag);
2231 return (no_callback);
2232 }
2233
b4f7f105 2234 hash_lock = HDR_LOCK(hdr);
34dc7c2f 2235 mutex_enter(hash_lock);
428870ff 2236 hdr = buf->b_hdr;
b9541d6b 2237 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
428870ff 2238 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
b9541d6b 2239 ASSERT(hdr->b_l1hdr.b_state != arc_anon);
34dc7c2f
BB
2240 ASSERT(buf->b_data != NULL);
2241
2242 (void) remove_reference(hdr, hash_lock, tag);
b9541d6b 2243 if (hdr->b_l1hdr.b_datacnt > 1) {
34dc7c2f 2244 if (no_callback)
ca0bf58d 2245 arc_buf_destroy(buf, TRUE);
34dc7c2f 2246 } else if (no_callback) {
b9541d6b 2247 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
428870ff 2248 ASSERT(buf->b_efunc == NULL);
2a432414 2249 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
34dc7c2f 2250 }
b9541d6b
CW
2251 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2252 refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
34dc7c2f
BB
2253 mutex_exit(hash_lock);
2254 return (no_callback);
2255}
2256
5f6d0b6f 2257uint64_t
34dc7c2f
BB
2258arc_buf_size(arc_buf_t *buf)
2259{
2260 return (buf->b_hdr->b_size);
2261}
2262
1eb5bfa3
GW
2263/*
2264 * Called from the DMU to determine if the current buffer should be
2265 * evicted. In order to ensure proper locking, the eviction must be initiated
2266 * from the DMU. Return true if the buffer is associated with user data and
2267 * duplicate buffers still exist.
2268 */
2269boolean_t
2270arc_buf_eviction_needed(arc_buf_t *buf)
2271{
2272 arc_buf_hdr_t *hdr;
2273 boolean_t evict_needed = B_FALSE;
2274
2275 if (zfs_disable_dup_eviction)
2276 return (B_FALSE);
2277
2278 mutex_enter(&buf->b_evict_lock);
2279 hdr = buf->b_hdr;
2280 if (hdr == NULL) {
2281 /*
2282 * We are in arc_do_user_evicts(); let that function
2283 * perform the eviction.
2284 */
2285 ASSERT(buf->b_data == NULL);
2286 mutex_exit(&buf->b_evict_lock);
2287 return (B_FALSE);
2288 } else if (buf->b_data == NULL) {
2289 /*
2290 * We have already been added to the arc eviction list;
2291 * recommend eviction.
2292 */
2293 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2294 mutex_exit(&buf->b_evict_lock);
2295 return (B_TRUE);
2296 }
2297
b9541d6b 2298 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
1eb5bfa3
GW
2299 evict_needed = B_TRUE;
2300
2301 mutex_exit(&buf->b_evict_lock);
2302 return (evict_needed);
2303}
2304
34dc7c2f 2305/*
ca0bf58d
PS
2306 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2307 * state of the header is dependent on its state prior to entering this
2308 * function. The following transitions are possible:
34dc7c2f 2309 *
ca0bf58d
PS
2310 * - arc_mru -> arc_mru_ghost
2311 * - arc_mfu -> arc_mfu_ghost
2312 * - arc_mru_ghost -> arc_l2c_only
2313 * - arc_mru_ghost -> deleted
2314 * - arc_mfu_ghost -> arc_l2c_only
2315 * - arc_mfu_ghost -> deleted
34dc7c2f 2316 */
ca0bf58d
PS
2317static int64_t
2318arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
34dc7c2f 2319{
ca0bf58d
PS
2320 arc_state_t *evicted_state, *state;
2321 int64_t bytes_evicted = 0;
34dc7c2f 2322
ca0bf58d
PS
2323 ASSERT(MUTEX_HELD(hash_lock));
2324 ASSERT(HDR_HAS_L1HDR(hdr));
e8b96c60 2325
ca0bf58d
PS
2326 state = hdr->b_l1hdr.b_state;
2327 if (GHOST_STATE(state)) {
2328 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2329 ASSERT(hdr->b_l1hdr.b_buf == NULL);
e8b96c60
MA
2330
2331 /*
ca0bf58d
PS
2332 * l2arc_write_buffers() relies on a header's L1 portion
2333 * (i.e. its b_tmp_cdata field) during its write phase.
2334 * Thus, we cannot push a header onto the arc_l2c_only
2335 * state (removing its L1 piece) until the header is
2336 * done being written to the l2arc.
e8b96c60 2337 */
ca0bf58d
PS
2338 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2339 ARCSTAT_BUMP(arcstat_evict_l2_skip);
2340 return (bytes_evicted);
e8b96c60
MA
2341 }
2342
ca0bf58d
PS
2343 ARCSTAT_BUMP(arcstat_deleted);
2344 bytes_evicted += hdr->b_size;
428870ff 2345
ca0bf58d 2346 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
428870ff 2347
ca0bf58d
PS
2348 if (HDR_HAS_L2HDR(hdr)) {
2349 /*
2350 * This buffer is cached on the 2nd Level ARC;
2351 * don't destroy the header.
2352 */
2353 arc_change_state(arc_l2c_only, hdr, hash_lock);
2354 /*
2355 * dropping from L1+L2 cached to L2-only,
2356 * realloc to remove the L1 header.
2357 */
2358 hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2359 hdr_l2only_cache);
34dc7c2f 2360 } else {
ca0bf58d
PS
2361 arc_change_state(arc_anon, hdr, hash_lock);
2362 arc_hdr_destroy(hdr);
34dc7c2f 2363 }
ca0bf58d 2364 return (bytes_evicted);
34dc7c2f
BB
2365 }
2366
ca0bf58d
PS
2367 ASSERT(state == arc_mru || state == arc_mfu);
2368 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
34dc7c2f 2369
ca0bf58d
PS
2370 /* prefetch buffers have a minimum lifespan */
2371 if (HDR_IO_IN_PROGRESS(hdr) ||
2372 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2373 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2374 arc_min_prefetch_lifespan)) {
2375 ARCSTAT_BUMP(arcstat_evict_skip);
2376 return (bytes_evicted);
da8ccd0e
PS
2377 }
2378
ca0bf58d
PS
2379 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2380 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2381 while (hdr->b_l1hdr.b_buf) {
2382 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2383 if (!mutex_tryenter(&buf->b_evict_lock)) {
2384 ARCSTAT_BUMP(arcstat_mutex_miss);
2385 break;
2386 }
2387 if (buf->b_data != NULL)
2388 bytes_evicted += hdr->b_size;
2389 if (buf->b_efunc != NULL) {
2390 mutex_enter(&arc_user_evicts_lock);
2391 arc_buf_destroy(buf, FALSE);
2392 hdr->b_l1hdr.b_buf = buf->b_next;
2393 buf->b_hdr = &arc_eviction_hdr;
2394 buf->b_next = arc_eviction_list;
2395 arc_eviction_list = buf;
2396 cv_signal(&arc_user_evicts_cv);
2397 mutex_exit(&arc_user_evicts_lock);
2398 mutex_exit(&buf->b_evict_lock);
2399 } else {
2400 mutex_exit(&buf->b_evict_lock);
2401 arc_buf_destroy(buf, TRUE);
2402 }
2403 }
34dc7c2f 2404
ca0bf58d
PS
2405 if (HDR_HAS_L2HDR(hdr)) {
2406 ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
2407 } else {
2408 if (l2arc_write_eligible(hdr->b_spa, hdr))
2409 ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
2410 else
2411 ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
2412 }
34dc7c2f 2413
ca0bf58d
PS
2414 if (hdr->b_l1hdr.b_datacnt == 0) {
2415 arc_change_state(evicted_state, hdr, hash_lock);
2416 ASSERT(HDR_IN_HASH_TABLE(hdr));
2417 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2418 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2419 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2420 }
34dc7c2f 2421
ca0bf58d 2422 return (bytes_evicted);
34dc7c2f
BB
2423}
2424
ca0bf58d
PS
2425static uint64_t
2426arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
2427 uint64_t spa, int64_t bytes)
34dc7c2f 2428{
ca0bf58d
PS
2429 multilist_sublist_t *mls;
2430 uint64_t bytes_evicted = 0;
2431 arc_buf_hdr_t *hdr;
34dc7c2f 2432 kmutex_t *hash_lock;
ca0bf58d 2433 int evict_count = 0;
34dc7c2f 2434
ca0bf58d
PS
2435 ASSERT3P(marker, !=, NULL);
2436 ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
2437
2438 mls = multilist_sublist_lock(ml, idx);
572e2857 2439
ca0bf58d
PS
2440 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
2441 hdr = multilist_sublist_prev(mls, marker)) {
2442 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
2443 (evict_count >= zfs_arc_evict_batch_limit))
2444 break;
2445
2446 /*
2447 * To keep our iteration location, move the marker
2448 * forward. Since we're not holding hdr's hash lock, we
2449 * must be very careful and not remove 'hdr' from the
2450 * sublist. Otherwise, other consumers might mistake the
2451 * 'hdr' as not being on a sublist when they call the
2452 * multilist_link_active() function (they all rely on
2453 * the hash lock protecting concurrent insertions and
2454 * removals). multilist_sublist_move_forward() was
2455 * specifically implemented to ensure this is the case
2456 * (only 'marker' will be removed and re-inserted).
2457 */
2458 multilist_sublist_move_forward(mls, marker);
2459
2460 /*
2461 * The only case where the b_spa field should ever be
2462 * zero, is the marker headers inserted by
2463 * arc_evict_state(). It's possible for multiple threads
2464 * to be calling arc_evict_state() concurrently (e.g.
2465 * dsl_pool_close() and zio_inject_fault()), so we must
2466 * skip any markers we see from these other threads.
2467 */
2a432414 2468 if (hdr->b_spa == 0)
572e2857
BB
2469 continue;
2470
ca0bf58d
PS
2471 /* we're only interested in evicting buffers of a certain spa */
2472 if (spa != 0 && hdr->b_spa != spa) {
2473 ARCSTAT_BUMP(arcstat_evict_skip);
428870ff 2474 continue;
ca0bf58d
PS
2475 }
2476
2477 hash_lock = HDR_LOCK(hdr);
e8b96c60
MA
2478
2479 /*
ca0bf58d
PS
2480 * We aren't calling this function from any code path
2481 * that would already be holding a hash lock, so we're
2482 * asserting on this assumption to be defensive in case
2483 * this ever changes. Without this check, it would be
2484 * possible to incorrectly increment arcstat_mutex_miss
2485 * below (e.g. if the code changed such that we called
2486 * this function with a hash lock held).
e8b96c60 2487 */
ca0bf58d
PS
2488 ASSERT(!MUTEX_HELD(hash_lock));
2489
34dc7c2f 2490 if (mutex_tryenter(hash_lock)) {
ca0bf58d
PS
2491 uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
2492 mutex_exit(hash_lock);
34dc7c2f 2493
ca0bf58d 2494 bytes_evicted += evicted;
34dc7c2f 2495
572e2857 2496 /*
ca0bf58d
PS
2497 * If evicted is zero, arc_evict_hdr() must have
2498 * decided to skip this header, don't increment
2499 * evict_count in this case.
572e2857 2500 */
ca0bf58d
PS
2501 if (evicted != 0)
2502 evict_count++;
2503
2504 /*
2505 * If arc_size isn't overflowing, signal any
2506 * threads that might happen to be waiting.
2507 *
2508 * For each header evicted, we wake up a single
2509 * thread. If we used cv_broadcast, we could
2510 * wake up "too many" threads causing arc_size
2511 * to significantly overflow arc_c; since
2512 * arc_get_data_buf() doesn't check for overflow
2513 * when it's woken up (it doesn't because it's
2514 * possible for the ARC to be overflowing while
2515 * full of un-evictable buffers, and the
2516 * function should proceed in this case).
2517 *
2518 * If threads are left sleeping, due to not
2519 * using cv_broadcast, they will be woken up
2520 * just before arc_reclaim_thread() sleeps.
2521 */
2522 mutex_enter(&arc_reclaim_lock);
2523 if (!arc_is_overflowing())
2524 cv_signal(&arc_reclaim_waiters_cv);
2525 mutex_exit(&arc_reclaim_lock);
e8b96c60 2526 } else {
ca0bf58d 2527 ARCSTAT_BUMP(arcstat_mutex_miss);
e8b96c60 2528 }
34dc7c2f 2529 }
34dc7c2f 2530
ca0bf58d 2531 multilist_sublist_unlock(mls);
34dc7c2f 2532
ca0bf58d 2533 return (bytes_evicted);
34dc7c2f
BB
2534}
2535
ca0bf58d
PS
2536/*
2537 * Evict buffers from the given arc state, until we've removed the
2538 * specified number of bytes. Move the removed buffers to the
2539 * appropriate evict state.
2540 *
2541 * This function makes a "best effort". It skips over any buffers
2542 * it can't get a hash_lock on, and so, may not catch all candidates.
2543 * It may also return without evicting as much space as requested.
2544 *
2545 * If bytes is specified using the special value ARC_EVICT_ALL, this
2546 * will evict all available (i.e. unlocked and evictable) buffers from
2547 * the given arc state; which is used by arc_flush().
2548 */
2549static uint64_t
2550arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
2551 arc_buf_contents_t type)
34dc7c2f 2552{
ca0bf58d
PS
2553 uint64_t total_evicted = 0;
2554 multilist_t *ml = &state->arcs_list[type];
2555 int num_sublists;
2556 arc_buf_hdr_t **markers;
2557 int i;
2558
2559 ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
2560
2561 num_sublists = multilist_get_num_sublists(ml);
d164b209
BB
2562
2563 /*
ca0bf58d
PS
2564 * If we've tried to evict from each sublist, made some
2565 * progress, but still have not hit the target number of bytes
2566 * to evict, we want to keep trying. The markers allow us to
2567 * pick up where we left off for each individual sublist, rather
2568 * than starting from the tail each time.
d164b209 2569 */
ca0bf58d
PS
2570 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
2571 for (i = 0; i < num_sublists; i++) {
2572 multilist_sublist_t *mls;
34dc7c2f 2573
ca0bf58d
PS
2574 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
2575
2576 /*
2577 * A b_spa of 0 is used to indicate that this header is
2578 * a marker. This fact is used in arc_adjust_type() and
2579 * arc_evict_state_impl().
2580 */
2581 markers[i]->b_spa = 0;
34dc7c2f 2582
ca0bf58d
PS
2583 mls = multilist_sublist_lock(ml, i);
2584 multilist_sublist_insert_tail(mls, markers[i]);
2585 multilist_sublist_unlock(mls);
34dc7c2f
BB
2586 }
2587
d164b209 2588 /*
ca0bf58d
PS
2589 * While we haven't hit our target number of bytes to evict, or
2590 * we're evicting all available buffers.
d164b209 2591 */
ca0bf58d
PS
2592 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
2593 /*
2594 * Start eviction using a randomly selected sublist,
2595 * this is to try and evenly balance eviction across all
2596 * sublists. Always starting at the same sublist
2597 * (e.g. index 0) would cause evictions to favor certain
2598 * sublists over others.
2599 */
2600 int sublist_idx = multilist_get_random_index(ml);
2601 uint64_t scan_evicted = 0;
34dc7c2f 2602
ca0bf58d
PS
2603 for (i = 0; i < num_sublists; i++) {
2604 uint64_t bytes_remaining;
2605 uint64_t bytes_evicted;
d164b209 2606
ca0bf58d
PS
2607 if (bytes == ARC_EVICT_ALL)
2608 bytes_remaining = ARC_EVICT_ALL;
2609 else if (total_evicted < bytes)
2610 bytes_remaining = bytes - total_evicted;
2611 else
2612 break;
34dc7c2f 2613
ca0bf58d
PS
2614 bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
2615 markers[sublist_idx], spa, bytes_remaining);
2616
2617 scan_evicted += bytes_evicted;
2618 total_evicted += bytes_evicted;
2619
2620 /* we've reached the end, wrap to the beginning */
2621 if (++sublist_idx >= num_sublists)
2622 sublist_idx = 0;
2623 }
2624
2625 /*
2626 * If we didn't evict anything during this scan, we have
2627 * no reason to believe we'll evict more during another
2628 * scan, so break the loop.
2629 */
2630 if (scan_evicted == 0) {
2631 /* This isn't possible, let's make that obvious */
2632 ASSERT3S(bytes, !=, 0);
34dc7c2f 2633
ca0bf58d
PS
2634 /*
2635 * When bytes is ARC_EVICT_ALL, the only way to
2636 * break the loop is when scan_evicted is zero.
2637 * In that case, we actually have evicted enough,
2638 * so we don't want to increment the kstat.
2639 */
2640 if (bytes != ARC_EVICT_ALL) {
2641 ASSERT3S(total_evicted, <, bytes);
2642 ARCSTAT_BUMP(arcstat_evict_not_enough);
2643 }
d164b209 2644
ca0bf58d
PS
2645 break;
2646 }
d164b209 2647 }
34dc7c2f 2648
ca0bf58d
PS
2649 for (i = 0; i < num_sublists; i++) {
2650 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
2651 multilist_sublist_remove(mls, markers[i]);
2652 multilist_sublist_unlock(mls);
34dc7c2f 2653
ca0bf58d 2654 kmem_cache_free(hdr_full_cache, markers[i]);
34dc7c2f 2655 }
ca0bf58d
PS
2656 kmem_free(markers, sizeof (*markers) * num_sublists);
2657
2658 return (total_evicted);
2659}
2660
2661/*
2662 * Flush all "evictable" data of the given type from the arc state
2663 * specified. This will not evict any "active" buffers (i.e. referenced).
2664 *
2665 * When 'retry' is set to FALSE, the function will make a single pass
2666 * over the state and evict any buffers that it can. Since it doesn't
2667 * continually retry the eviction, it might end up leaving some buffers
2668 * in the ARC due to lock misses.
2669 *
2670 * When 'retry' is set to TRUE, the function will continually retry the
2671 * eviction until *all* evictable buffers have been removed from the
2672 * state. As a result, if concurrent insertions into the state are
2673 * allowed (e.g. if the ARC isn't shutting down), this function might
2674 * wind up in an infinite loop, continually trying to evict buffers.
2675 */
2676static uint64_t
2677arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
2678 boolean_t retry)
2679{
2680 uint64_t evicted = 0;
2681
2682 while (state->arcs_lsize[type] != 0) {
2683 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
2684
2685 if (!retry)
2686 break;
2687 }
2688
2689 return (evicted);
34dc7c2f
BB
2690}
2691
ab26409d 2692/*
f6046738
BB
2693 * Helper function for arc_prune() it is responsible for safely handling
2694 * the execution of a registered arc_prune_func_t.
ab26409d
BB
2695 */
2696static void
f6046738 2697arc_prune_task(void *ptr)
ab26409d 2698{
f6046738
BB
2699 arc_prune_t *ap = (arc_prune_t *)ptr;
2700 arc_prune_func_t *func = ap->p_pfunc;
ab26409d 2701
f6046738
BB
2702 if (func != NULL)
2703 func(ap->p_adjust, ap->p_private);
ab26409d 2704
f6046738
BB
2705 /* Callback unregistered concurrently with execution */
2706 if (refcount_remove(&ap->p_refcnt, func) == 0) {
2707 ASSERT(!list_link_active(&ap->p_node));
2708 refcount_destroy(&ap->p_refcnt);
2709 kmem_free(ap, sizeof (*ap));
2710 }
2711}
ab26409d 2712
f6046738
BB
2713/*
2714 * Notify registered consumers they must drop holds on a portion of the ARC
2715 * buffered they reference. This provides a mechanism to ensure the ARC can
2716 * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
2717 * is analogous to dnlc_reduce_cache() but more generic.
2718 *
2719 * This operation is performed asyncronously so it may be safely called
ca67b33a 2720 * in the context of the arc_reclaim_thread(). A reference is taken here
f6046738
BB
2721 * for each registered arc_prune_t and the arc_prune_task() is responsible
2722 * for releasing it once the registered arc_prune_func_t has completed.
2723 */
2724static void
2725arc_prune_async(int64_t adjust)
2726{
2727 arc_prune_t *ap;
ab26409d 2728
f6046738
BB
2729 mutex_enter(&arc_prune_mtx);
2730 for (ap = list_head(&arc_prune_list); ap != NULL;
2731 ap = list_next(&arc_prune_list, ap)) {
ab26409d 2732
f6046738
BB
2733 if (refcount_count(&ap->p_refcnt) >= 2)
2734 continue;
ab26409d 2735
f6046738
BB
2736 refcount_add(&ap->p_refcnt, ap->p_pfunc);
2737 ap->p_adjust = adjust;
2738 taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
2739 ARCSTAT_BUMP(arcstat_prune);
ab26409d 2740 }
ab26409d
BB
2741 mutex_exit(&arc_prune_mtx);
2742}
2743
f6046738
BB
2744static void
2745arc_prune(int64_t adjust)
2746{
2747 arc_prune_async(adjust);
2748 taskq_wait_outstanding(arc_prune_taskq, 0);
2749}
2750
ca0bf58d
PS
2751/*
2752 * Evict the specified number of bytes from the state specified,
2753 * restricting eviction to the spa and type given. This function
2754 * prevents us from trying to evict more from a state's list than
2755 * is "evictable", and to skip evicting altogether when passed a
2756 * negative value for "bytes". In contrast, arc_evict_state() will
2757 * evict everything it can, when passed a negative value for "bytes".
2758 */
2759static uint64_t
2760arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
2761 arc_buf_contents_t type)
2762{
2763 int64_t delta;
2764
2765 if (bytes > 0 && state->arcs_lsize[type] > 0) {
2766 delta = MIN(state->arcs_lsize[type], bytes);
2767 return (arc_evict_state(state, spa, delta, type));
2768 }
2769
2770 return (0);
2771}
2772
2773/*
2774 * The goal of this function is to evict enough meta data buffers from the
2775 * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
2776 * more complicated than it appears because it is common for data buffers
2777 * to have holds on meta data buffers. In addition, dnode meta data buffers
2778 * will be held by the dnodes in the block preventing them from being freed.
2779 * This means we can't simply traverse the ARC and expect to always find
2780 * enough unheld meta data buffer to release.
2781 *
2782 * Therefore, this function has been updated to make alternating passes
2783 * over the ARC releasing data buffers and then newly unheld meta data
2784 * buffers. This ensures forward progress is maintained and arc_meta_used
2785 * will decrease. Normally this is sufficient, but if required the ARC
2786 * will call the registered prune callbacks causing dentry and inodes to
2787 * be dropped from the VFS cache. This will make dnode meta data buffers
2788 * available for reclaim.
2789 */
2790static uint64_t
f6046738 2791arc_adjust_meta_balanced(void)
ca0bf58d
PS
2792{
2793 int64_t adjustmnt, delta, prune = 0;
2794 uint64_t total_evicted = 0;
2795 arc_buf_contents_t type = ARC_BUFC_DATA;
ca67b33a 2796 int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
ca0bf58d
PS
2797
2798restart:
2799 /*
2800 * This slightly differs than the way we evict from the mru in
2801 * arc_adjust because we don't have a "target" value (i.e. no
2802 * "meta" arc_p). As a result, I think we can completely
2803 * cannibalize the metadata in the MRU before we evict the
2804 * metadata from the MFU. I think we probably need to implement a
2805 * "metadata arc_p" value to do this properly.
2806 */
2807 adjustmnt = arc_meta_used - arc_meta_limit;
2808
2809 if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
2810 delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
2811 total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
2812 adjustmnt -= delta;
2813 }
2814
2815 /*
2816 * We can't afford to recalculate adjustmnt here. If we do,
2817 * new metadata buffers can sneak into the MRU or ANON lists,
2818 * thus penalize the MFU metadata. Although the fudge factor is
2819 * small, it has been empirically shown to be significant for
2820 * certain workloads (e.g. creating many empty directories). As
2821 * such, we use the original calculation for adjustmnt, and
2822 * simply decrement the amount of data evicted from the MRU.
2823 */
2824
2825 if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
2826 delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
2827 total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
2828 }
2829
2830 adjustmnt = arc_meta_used - arc_meta_limit;
2831
2832 if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2833 delta = MIN(adjustmnt,
2834 arc_mru_ghost->arcs_lsize[type]);
2835 total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
2836 adjustmnt -= delta;
2837 }
2838
2839 if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
2840 delta = MIN(adjustmnt,
2841 arc_mfu_ghost->arcs_lsize[type]);
2842 total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
2843 }
2844
2845 /*
2846 * If after attempting to make the requested adjustment to the ARC
2847 * the meta limit is still being exceeded then request that the
2848 * higher layers drop some cached objects which have holds on ARC
2849 * meta buffers. Requests to the upper layers will be made with
2850 * increasingly large scan sizes until the ARC is below the limit.
2851 */
2852 if (arc_meta_used > arc_meta_limit) {
2853 if (type == ARC_BUFC_DATA) {
2854 type = ARC_BUFC_METADATA;
2855 } else {
2856 type = ARC_BUFC_DATA;
2857
2858 if (zfs_arc_meta_prune) {
2859 prune += zfs_arc_meta_prune;
f6046738 2860 arc_prune_async(prune);
ca0bf58d
PS
2861 }
2862 }
2863
2864 if (restarts > 0) {
2865 restarts--;
2866 goto restart;
2867 }
2868 }
2869 return (total_evicted);
2870}
2871
f6046738
BB
2872/*
2873 * Evict metadata buffers from the cache, such that arc_meta_used is
2874 * capped by the arc_meta_limit tunable.
2875 */
2876static uint64_t
2877arc_adjust_meta_only(void)
2878{
2879 uint64_t total_evicted = 0;
2880 int64_t target;
2881
2882 /*
2883 * If we're over the meta limit, we want to evict enough
2884 * metadata to get back under the meta limit. We don't want to
2885 * evict so much that we drop the MRU below arc_p, though. If
2886 * we're over the meta limit more than we're over arc_p, we
2887 * evict some from the MRU here, and some from the MFU below.
2888 */
2889 target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
36da08ef
PS
2890 (int64_t)(refcount_count(&arc_anon->arcs_size) +
2891 refcount_count(&arc_mru->arcs_size) - arc_p));
f6046738
BB
2892
2893 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2894
2895 /*
2896 * Similar to the above, we want to evict enough bytes to get us
2897 * below the meta limit, but not so much as to drop us below the
2898 * space alloted to the MFU (which is defined as arc_c - arc_p).
2899 */
2900 target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
36da08ef 2901 (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
f6046738
BB
2902
2903 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2904
2905 return (total_evicted);
2906}
2907
2908static uint64_t
2909arc_adjust_meta(void)
2910{
2911 if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
2912 return (arc_adjust_meta_only());
2913 else
2914 return (arc_adjust_meta_balanced());
2915}
2916
ca0bf58d
PS
2917/*
2918 * Return the type of the oldest buffer in the given arc state
2919 *
2920 * This function will select a random sublist of type ARC_BUFC_DATA and
2921 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
2922 * is compared, and the type which contains the "older" buffer will be
2923 * returned.
2924 */
2925static arc_buf_contents_t
2926arc_adjust_type(arc_state_t *state)
2927{
2928 multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
2929 multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
2930 int data_idx = multilist_get_random_index(data_ml);
2931 int meta_idx = multilist_get_random_index(meta_ml);
2932 multilist_sublist_t *data_mls;
2933 multilist_sublist_t *meta_mls;
2934 arc_buf_contents_t type;
2935 arc_buf_hdr_t *data_hdr;
2936 arc_buf_hdr_t *meta_hdr;
2937
2938 /*
2939 * We keep the sublist lock until we're finished, to prevent
2940 * the headers from being destroyed via arc_evict_state().
2941 */
2942 data_mls = multilist_sublist_lock(data_ml, data_idx);
2943 meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
2944
2945 /*
2946 * These two loops are to ensure we skip any markers that
2947 * might be at the tail of the lists due to arc_evict_state().
2948 */
2949
2950 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
2951 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
2952 if (data_hdr->b_spa != 0)
2953 break;
2954 }
2955
2956 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
2957 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
2958 if (meta_hdr->b_spa != 0)
2959 break;
2960 }
2961
2962 if (data_hdr == NULL && meta_hdr == NULL) {
2963 type = ARC_BUFC_DATA;
2964 } else if (data_hdr == NULL) {
2965 ASSERT3P(meta_hdr, !=, NULL);
2966 type = ARC_BUFC_METADATA;
2967 } else if (meta_hdr == NULL) {
2968 ASSERT3P(data_hdr, !=, NULL);
2969 type = ARC_BUFC_DATA;
2970 } else {
2971 ASSERT3P(data_hdr, !=, NULL);
2972 ASSERT3P(meta_hdr, !=, NULL);
2973
2974 /* The headers can't be on the sublist without an L1 header */
2975 ASSERT(HDR_HAS_L1HDR(data_hdr));
2976 ASSERT(HDR_HAS_L1HDR(meta_hdr));
2977
2978 if (data_hdr->b_l1hdr.b_arc_access <
2979 meta_hdr->b_l1hdr.b_arc_access) {
2980 type = ARC_BUFC_DATA;
2981 } else {
2982 type = ARC_BUFC_METADATA;
2983 }
2984 }
2985
2986 multilist_sublist_unlock(meta_mls);
2987 multilist_sublist_unlock(data_mls);
2988
2989 return (type);
2990}
2991
2992/*
2993 * Evict buffers from the cache, such that arc_size is capped by arc_c.
2994 */
2995static uint64_t
2996arc_adjust(void)
2997{
2998 uint64_t total_evicted = 0;
2999 uint64_t bytes;
3000 int64_t target;
3001
3002 /*
3003 * If we're over arc_meta_limit, we want to correct that before
3004 * potentially evicting data buffers below.
3005 */
3006 total_evicted += arc_adjust_meta();
3007
3008 /*
3009 * Adjust MRU size
3010 *
3011 * If we're over the target cache size, we want to evict enough
3012 * from the list to get back to our target size. We don't want
3013 * to evict too much from the MRU, such that it drops below
3014 * arc_p. So, if we're over our target cache size more than
3015 * the MRU is over arc_p, we'll evict enough to get back to
3016 * arc_p here, and then evict more from the MFU below.
3017 */
3018 target = MIN((int64_t)(arc_size - arc_c),
36da08ef
PS
3019 (int64_t)(refcount_count(&arc_anon->arcs_size) +
3020 refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
ca0bf58d
PS
3021
3022 /*
3023 * If we're below arc_meta_min, always prefer to evict data.
3024 * Otherwise, try to satisfy the requested number of bytes to
3025 * evict from the type which contains older buffers; in an
3026 * effort to keep newer buffers in the cache regardless of their
3027 * type. If we cannot satisfy the number of bytes from this
3028 * type, spill over into the next type.
3029 */
3030 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
3031 arc_meta_used > arc_meta_min) {
3032 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3033 total_evicted += bytes;
3034
3035 /*
3036 * If we couldn't evict our target number of bytes from
3037 * metadata, we try to get the rest from data.
3038 */
3039 target -= bytes;
3040
3041 total_evicted +=
3042 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3043 } else {
3044 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3045 total_evicted += bytes;
3046
3047 /*
3048 * If we couldn't evict our target number of bytes from
3049 * data, we try to get the rest from metadata.
3050 */
3051 target -= bytes;
3052
3053 total_evicted +=
3054 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3055 }
3056
3057 /*
3058 * Adjust MFU size
3059 *
3060 * Now that we've tried to evict enough from the MRU to get its
3061 * size back to arc_p, if we're still above the target cache
3062 * size, we evict the rest from the MFU.
3063 */
3064 target = arc_size - arc_c;
3065
a7b10a93 3066 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
ca0bf58d
PS
3067 arc_meta_used > arc_meta_min) {
3068 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3069 total_evicted += bytes;
3070
3071 /*
3072 * If we couldn't evict our target number of bytes from
3073 * metadata, we try to get the rest from data.
3074 */
3075 target -= bytes;
3076
3077 total_evicted +=
3078 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3079 } else {
3080 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3081 total_evicted += bytes;
3082
3083 /*
3084 * If we couldn't evict our target number of bytes from
3085 * data, we try to get the rest from data.
3086 */
3087 target -= bytes;
3088
3089 total_evicted +=
3090 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3091 }
3092
3093 /*
3094 * Adjust ghost lists
3095 *
3096 * In addition to the above, the ARC also defines target values
3097 * for the ghost lists. The sum of the mru list and mru ghost
3098 * list should never exceed the target size of the cache, and
3099 * the sum of the mru list, mfu list, mru ghost list, and mfu
3100 * ghost list should never exceed twice the target size of the
3101 * cache. The following logic enforces these limits on the ghost
3102 * caches, and evicts from them as needed.
3103 */
36da08ef
PS
3104 target = refcount_count(&arc_mru->arcs_size) +
3105 refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
ca0bf58d
PS
3106
3107 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3108 total_evicted += bytes;
3109
3110 target -= bytes;
3111
3112 total_evicted +=
3113 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3114
3115 /*
3116 * We assume the sum of the mru list and mfu list is less than
3117 * or equal to arc_c (we enforced this above), which means we
3118 * can use the simpler of the two equations below:
3119 *
3120 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3121 * mru ghost + mfu ghost <= arc_c
3122 */
36da08ef
PS
3123 target = refcount_count(&arc_mru_ghost->arcs_size) +
3124 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
ca0bf58d
PS
3125
3126 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3127 total_evicted += bytes;
3128
3129 target -= bytes;
3130
3131 total_evicted +=
3132 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3133
3134 return (total_evicted);
3135}
3136
34dc7c2f
BB
3137static void
3138arc_do_user_evicts(void)
3139{
ca0bf58d 3140 mutex_enter(&arc_user_evicts_lock);
34dc7c2f
BB
3141 while (arc_eviction_list != NULL) {
3142 arc_buf_t *buf = arc_eviction_list;
3143 arc_eviction_list = buf->b_next;
428870ff 3144 mutex_enter(&buf->b_evict_lock);
34dc7c2f 3145 buf->b_hdr = NULL;
428870ff 3146 mutex_exit(&buf->b_evict_lock);
ca0bf58d 3147 mutex_exit(&arc_user_evicts_lock);
34dc7c2f
BB
3148
3149 if (buf->b_efunc != NULL)
bd089c54 3150 VERIFY0(buf->b_efunc(buf->b_private));
34dc7c2f
BB
3151
3152 buf->b_efunc = NULL;
3153 buf->b_private = NULL;
3154 kmem_cache_free(buf_cache, buf);
ca0bf58d 3155 mutex_enter(&arc_user_evicts_lock);
34dc7c2f 3156 }
ca0bf58d 3157 mutex_exit(&arc_user_evicts_lock);
34dc7c2f
BB
3158}
3159
ca0bf58d
PS
3160void
3161arc_flush(spa_t *spa, boolean_t retry)
ab26409d 3162{
ca0bf58d 3163 uint64_t guid = 0;
94520ca4 3164
bc888666 3165 /*
ca0bf58d
PS
3166 * If retry is TRUE, a spa must not be specified since we have
3167 * no good way to determine if all of a spa's buffers have been
3168 * evicted from an arc state.
bc888666 3169 */
ca0bf58d 3170 ASSERT(!retry || spa == 0);
d164b209 3171
b9541d6b 3172 if (spa != NULL)
3541dc6d 3173 guid = spa_load_guid(spa);
d164b209 3174
ca0bf58d
PS
3175 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
3176 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
3177
3178 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
3179 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
3180
3181 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
3182 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
34dc7c2f 3183
ca0bf58d
PS
3184 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
3185 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
34dc7c2f 3186
34dc7c2f 3187 arc_do_user_evicts();
34dc7c2f
BB
3188 ASSERT(spa || arc_eviction_list == NULL);
3189}
3190
34dc7c2f 3191void
ca67b33a 3192arc_shrink(int64_t to_free)
34dc7c2f
BB
3193{
3194 if (arc_c > arc_c_min) {
302f753f 3195
34dc7c2f
BB
3196 if (arc_c > arc_c_min + to_free)
3197 atomic_add_64(&arc_c, -to_free);
3198 else
3199 arc_c = arc_c_min;
3200
ca67b33a 3201 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
34dc7c2f
BB
3202 if (arc_c > arc_size)
3203 arc_c = MAX(arc_size, arc_c_min);
3204 if (arc_p > arc_c)
3205 arc_p = (arc_c >> 1);
3206 ASSERT(arc_c >= arc_c_min);
3207 ASSERT((int64_t)arc_p >= 0);
3208 }
3209
3210 if (arc_size > arc_c)
ca0bf58d 3211 (void) arc_adjust();
34dc7c2f
BB
3212}
3213
ca67b33a
MA
3214typedef enum free_memory_reason_t {
3215 FMR_UNKNOWN,
3216 FMR_NEEDFREE,
3217 FMR_LOTSFREE,
3218 FMR_SWAPFS_MINFREE,
3219 FMR_PAGES_PP_MAXIMUM,
3220 FMR_HEAP_ARENA,
3221 FMR_ZIO_ARENA,
3222} free_memory_reason_t;
3223
3224int64_t last_free_memory;
3225free_memory_reason_t last_free_reason;
3226
3227#ifdef _KERNEL
3228#ifdef __linux__
3229/*
3230 * expiration time for arc_no_grow set by direct memory reclaim.
3231 */
3232static clock_t arc_grow_time = 0;
3233#else
3234/*
3235 * Additional reserve of pages for pp_reserve.
3236 */
3237int64_t arc_pages_pp_reserve = 64;
3238
3239/*
3240 * Additional reserve of pages for swapfs.
3241 */
3242int64_t arc_swapfs_reserve = 64;
3243#endif
3244#endif /* _KERNEL */
3245
3246/*
3247 * Return the amount of memory that can be consumed before reclaim will be
3248 * needed. Positive if there is sufficient free memory, negative indicates
3249 * the amount of memory that needs to be freed up.
3250 */
3251static int64_t
3252arc_available_memory(void)
3253{
3254 int64_t lowest = INT64_MAX;
3255 free_memory_reason_t r = FMR_UNKNOWN;
3256
3257#ifdef _KERNEL
3258#ifdef __linux__
3259 /*
3260 * Under Linux we are not allowed to directly interrogate the global
3261 * memory state. Instead rely on observing that direct reclaim has
3262 * recently occurred therefore the system must be low on memory. The
3263 * exact values returned are not critical but should be small.
3264 */
3265 if (ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
3266 lowest = PAGE_SIZE;
3267 else
3268 lowest = -PAGE_SIZE;
3269#else
3270 int64_t n;
3271
3272 /*
3273 * Platforms like illumos have greater visibility in to the memory
3274 * subsystem and can return a more detailed analysis of memory.
3275 */
3276 if (needfree > 0) {
3277 n = PAGESIZE * (-needfree);
3278 if (n < lowest) {
3279 lowest = n;
3280 r = FMR_NEEDFREE;
3281 }
3282 }
3283
3284 /*
3285 * check that we're out of range of the pageout scanner. It starts to
3286 * schedule paging if freemem is less than lotsfree and needfree.
3287 * lotsfree is the high-water mark for pageout, and needfree is the
3288 * number of needed free pages. We add extra pages here to make sure
3289 * the scanner doesn't start up while we're freeing memory.
3290 */
3291 n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3292 if (n < lowest) {
3293 lowest = n;
3294 r = FMR_LOTSFREE;
3295 }
3296
3297 /*
3298 * check to make sure that swapfs has enough space so that anon
3299 * reservations can still succeed. anon_resvmem() checks that the
3300 * availrmem is greater than swapfs_minfree, and the number of reserved
3301 * swap pages. We also add a bit of extra here just to prevent
3302 * circumstances from getting really dire.
3303 */
3304 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3305 desfree - arc_swapfs_reserve);
3306 if (n < lowest) {
3307 lowest = n;
3308 r = FMR_SWAPFS_MINFREE;
3309 }
3310
3311
3312 /*
3313 * Check that we have enough availrmem that memory locking (e.g., via
3314 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
3315 * stores the number of pages that cannot be locked; when availrmem
3316 * drops below pages_pp_maximum, page locking mechanisms such as
3317 * page_pp_lock() will fail.)
3318 */
3319 n = PAGESIZE * (availrmem - pages_pp_maximum -
3320 arc_pages_pp_reserve);
3321 if (n < lowest) {
3322 lowest = n;
3323 r = FMR_PAGES_PP_MAXIMUM;
3324 }
3325
3326#if defined(__i386)
3327 /*
3328 * If we're on an i386 platform, it's possible that we'll exhaust the
3329 * kernel heap space before we ever run out of available physical
3330 * memory. Most checks of the size of the heap_area compare against
3331 * tune.t_minarmem, which is the minimum available real memory that we
3332 * can have in the system. However, this is generally fixed at 25 pages
3333 * which is so low that it's useless. In this comparison, we seek to
3334 * calculate the total heap-size, and reclaim if more than 3/4ths of the
3335 * heap is allocated. (Or, in the calculation, if less than 1/4th is
3336 * free)
3337 */
3338 n = vmem_size(heap_arena, VMEM_FREE) -
3339 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3340 if (n < lowest) {
3341 lowest = n;
3342 r = FMR_HEAP_ARENA;
3343 }
3344#endif
3345
3346 /*
3347 * If zio data pages are being allocated out of a separate heap segment,
3348 * then enforce that the size of available vmem for this arena remains
3349 * above about 1/16th free.
3350 *
3351 * Note: The 1/16th arena free requirement was put in place
3352 * to aggressively evict memory from the arc in order to avoid
3353 * memory fragmentation issues.
3354 */
3355 if (zio_arena != NULL) {
3356 n = vmem_size(zio_arena, VMEM_FREE) -
3357 (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3358 if (n < lowest) {
3359 lowest = n;
3360 r = FMR_ZIO_ARENA;
3361 }
3362 }
3363#endif /* __linux__ */
3364#else
3365 /* Every 100 calls, free a small amount */
3366 if (spa_get_random(100) == 0)
3367 lowest = -1024;
3368#endif
3369
3370 last_free_memory = lowest;
3371 last_free_reason = r;
3372
3373 return (lowest);
3374}
3375
3376/*
3377 * Determine if the system is under memory pressure and is asking
3378 * to reclaim memory. A return value of TRUE indicates that the system
3379 * is under memory pressure and that the arc should adjust accordingly.
3380 */
3381static boolean_t
3382arc_reclaim_needed(void)
3383{
3384 return (arc_available_memory() < 0);
3385}
3386
34dc7c2f 3387static void
ca67b33a 3388arc_kmem_reap_now(void)
34dc7c2f
BB
3389{
3390 size_t i;
3391 kmem_cache_t *prev_cache = NULL;
3392 kmem_cache_t *prev_data_cache = NULL;
3393 extern kmem_cache_t *zio_buf_cache[];
3394 extern kmem_cache_t *zio_data_buf_cache[];
669dedb3 3395 extern kmem_cache_t *range_seg_cache;
34dc7c2f 3396
f6046738
BB
3397 if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
3398 /*
3399 * We are exceeding our meta-data cache limit.
3400 * Prune some entries to release holds on meta-data.
3401 */
3402 arc_prune(zfs_arc_meta_prune);
3403 }
3404
34dc7c2f
BB
3405 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3406 if (zio_buf_cache[i] != prev_cache) {
3407 prev_cache = zio_buf_cache[i];
3408 kmem_cache_reap_now(zio_buf_cache[i]);
3409 }
3410 if (zio_data_buf_cache[i] != prev_data_cache) {
3411 prev_data_cache = zio_data_buf_cache[i];
3412 kmem_cache_reap_now(zio_data_buf_cache[i]);
3413 }
3414 }
ca0bf58d 3415 kmem_cache_reap_now(buf_cache);
b9541d6b
CW
3416 kmem_cache_reap_now(hdr_full_cache);
3417 kmem_cache_reap_now(hdr_l2only_cache);
669dedb3 3418 kmem_cache_reap_now(range_seg_cache);
ca67b33a
MA
3419
3420 if (zio_arena != NULL) {
3421 /*
3422 * Ask the vmem arena to reclaim unused memory from its
3423 * quantum caches.
3424 */
3425 vmem_qcache_reap(zio_arena);
3426 }
34dc7c2f
BB
3427}
3428
302f753f 3429/*
ca0bf58d
PS
3430 * Threads can block in arc_get_data_buf() waiting for this thread to evict
3431 * enough data and signal them to proceed. When this happens, the threads in
3432 * arc_get_data_buf() are sleeping while holding the hash lock for their
3433 * particular arc header. Thus, we must be careful to never sleep on a
3434 * hash lock in this thread. This is to prevent the following deadlock:
3435 *
3436 * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
3437 * waiting for the reclaim thread to signal it.
3438 *
3439 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
3440 * fails, and goes to sleep forever.
3441 *
3442 * This possible deadlock is avoided by always acquiring a hash lock
3443 * using mutex_tryenter() from arc_reclaim_thread().
302f753f 3444 */
34dc7c2f 3445static void
ca67b33a 3446arc_reclaim_thread(void)
34dc7c2f 3447{
ca67b33a
MA
3448 fstrans_cookie_t cookie = spl_fstrans_mark();
3449 clock_t growtime = 0;
34dc7c2f
BB
3450 callb_cpr_t cpr;
3451
ca0bf58d 3452 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
34dc7c2f 3453
ca0bf58d 3454 mutex_enter(&arc_reclaim_lock);
ca67b33a
MA
3455 while (!arc_reclaim_thread_exit) {
3456 int64_t to_free;
3457 int64_t free_memory = arc_available_memory();
3458 uint64_t evicted = 0;
302f753f 3459
ca67b33a 3460 arc_tuning_update();
34dc7c2f 3461
ca67b33a 3462 mutex_exit(&arc_reclaim_lock);
34dc7c2f 3463
ca67b33a 3464 if (free_memory < 0) {
34dc7c2f 3465
ca67b33a 3466 arc_no_grow = B_TRUE;
b128c09f 3467 arc_warm = B_TRUE;
34dc7c2f 3468
ca67b33a
MA
3469 /*
3470 * Wait at least zfs_grow_retry (default 5) seconds
3471 * before considering growing.
3472 */
3473 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
6a8f9b6b 3474
ca67b33a 3475 arc_kmem_reap_now();
34dc7c2f 3476
ca67b33a
MA
3477 /*
3478 * If we are still low on memory, shrink the ARC
3479 * so that we have arc_shrink_min free space.
3480 */
3481 free_memory = arc_available_memory();
34dc7c2f 3482
ca67b33a
MA
3483 to_free = (arc_c >> arc_shrink_shift) - free_memory;
3484 if (to_free > 0) {
3485#ifdef _KERNEL
3486 to_free = MAX(to_free, ptob(needfree));
3487#endif
3488 arc_shrink(to_free);
3489 }
3490 } else if (free_memory < arc_c >> arc_no_grow_shift) {
3491 arc_no_grow = B_TRUE;
3492 } else if (ddi_get_lbolt() >= growtime) {
3493 arc_no_grow = B_FALSE;
3494 }
bce45ec9 3495
ca67b33a 3496 evicted = arc_adjust();
bce45ec9 3497
ca67b33a 3498 mutex_enter(&arc_reclaim_lock);
bce45ec9 3499
ca67b33a
MA
3500 /*
3501 * If evicted is zero, we couldn't evict anything via
3502 * arc_adjust(). This could be due to hash lock
3503 * collisions, but more likely due to the majority of
3504 * arc buffers being unevictable. Therefore, even if
3505 * arc_size is above arc_c, another pass is unlikely to
3506 * be helpful and could potentially cause us to enter an
3507 * infinite loop.
3508 */
3509 if (arc_size <= arc_c || evicted == 0) {
3510 /*
3511 * We're either no longer overflowing, or we
3512 * can't evict anything more, so we should wake
3513 * up any threads before we go to sleep.
3514 */
3515 cv_broadcast(&arc_reclaim_waiters_cv);
bce45ec9 3516
ca67b33a
MA
3517 /*
3518 * Block until signaled, or after one second (we
3519 * might need to perform arc_kmem_reap_now()
3520 * even if we aren't being signalled)
3521 */
3522 CALLB_CPR_SAFE_BEGIN(&cpr);
3523 (void) cv_timedwait_sig(&arc_reclaim_thread_cv,
3524 &arc_reclaim_lock, ddi_get_lbolt() + hz);
3525 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
3526 }
ca0bf58d 3527 }
bce45ec9 3528
ca67b33a 3529 arc_reclaim_thread_exit = FALSE;
ca0bf58d
PS
3530 cv_broadcast(&arc_reclaim_thread_cv);
3531 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
3532 spl_fstrans_unmark(cookie);
3533 thread_exit();
3534}
3535
3536static void
3537arc_user_evicts_thread(void)
3538{
ca67b33a 3539 fstrans_cookie_t cookie = spl_fstrans_mark();
ca0bf58d 3540 callb_cpr_t cpr;
bce45ec9 3541
ca0bf58d 3542 CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
bce45ec9 3543
ca0bf58d
PS
3544 mutex_enter(&arc_user_evicts_lock);
3545 while (!arc_user_evicts_thread_exit) {
3546 mutex_exit(&arc_user_evicts_lock);
3547
3548 arc_do_user_evicts();
3549
3550 /*
3551 * This is necessary in order for the mdb ::arc dcmd to
3552 * show up to date information. Since the ::arc command
3553 * does not call the kstat's update function, without
3554 * this call, the command may show stale stats for the
3555 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3556 * with this change, the data might be up to 1 second
3557 * out of date; but that should suffice. The arc_state_t
3558 * structures can be queried directly if more accurate
3559 * information is needed.
3560 */
3561 if (arc_ksp != NULL)
3562 arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3563
3564 mutex_enter(&arc_user_evicts_lock);
3565
3566 /*
3567 * Block until signaled, or after one second (we need to
3568 * call the arc's kstat update function regularly).
3569 */
3570 CALLB_CPR_SAFE_BEGIN(&cpr);
b64ccd6c 3571 (void) cv_timedwait_sig(&arc_user_evicts_cv,
ca0bf58d
PS
3572 &arc_user_evicts_lock, ddi_get_lbolt() + hz);
3573 CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
34dc7c2f
BB
3574 }
3575
ca0bf58d
PS
3576 arc_user_evicts_thread_exit = FALSE;
3577 cv_broadcast(&arc_user_evicts_cv);
3578 CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */
40d06e3c 3579 spl_fstrans_unmark(cookie);
34dc7c2f
BB
3580 thread_exit();
3581}
3582
7cb67b45
BB
3583#ifdef _KERNEL
3584/*
302f753f
BB
3585 * Determine the amount of memory eligible for eviction contained in the
3586 * ARC. All clean data reported by the ghost lists can always be safely
3587 * evicted. Due to arc_c_min, the same does not hold for all clean data
3588 * contained by the regular mru and mfu lists.
3589 *
3590 * In the case of the regular mru and mfu lists, we need to report as
3591 * much clean data as possible, such that evicting that same reported
3592 * data will not bring arc_size below arc_c_min. Thus, in certain
3593 * circumstances, the total amount of clean data in the mru and mfu
3594 * lists might not actually be evictable.
3595 *
3596 * The following two distinct cases are accounted for:
3597 *
3598 * 1. The sum of the amount of dirty data contained by both the mru and
3599 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
3600 * is greater than or equal to arc_c_min.
3601 * (i.e. amount of dirty data >= arc_c_min)
3602 *
3603 * This is the easy case; all clean data contained by the mru and mfu
3604 * lists is evictable. Evicting all clean data can only drop arc_size
3605 * to the amount of dirty data, which is greater than arc_c_min.
3606 *
3607 * 2. The sum of the amount of dirty data contained by both the mru and
3608 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
3609 * is less than arc_c_min.
3610 * (i.e. arc_c_min > amount of dirty data)
3611 *
3612 * 2.1. arc_size is greater than or equal arc_c_min.
3613 * (i.e. arc_size >= arc_c_min > amount of dirty data)
3614 *
3615 * In this case, not all clean data from the regular mru and mfu
3616 * lists is actually evictable; we must leave enough clean data
3617 * to keep arc_size above arc_c_min. Thus, the maximum amount of
3618 * evictable data from the two lists combined, is exactly the
3619 * difference between arc_size and arc_c_min.
3620 *
3621 * 2.2. arc_size is less than arc_c_min
3622 * (i.e. arc_c_min > arc_size > amount of dirty data)
3623 *
3624 * In this case, none of the data contained in the mru and mfu
3625 * lists is evictable, even if it's clean. Since arc_size is
3626 * already below arc_c_min, evicting any more would only
3627 * increase this negative difference.
7cb67b45 3628 */
302f753f
BB
3629static uint64_t
3630arc_evictable_memory(void) {
3631 uint64_t arc_clean =
3632 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3633 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3634 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3635 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3636 uint64_t ghost_clean =
3637 arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
3638 arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
3639 arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
3640 arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
3641 uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
3642
3643 if (arc_dirty >= arc_c_min)
3644 return (ghost_clean + arc_clean);
3645
3646 return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
3647}
3648
ed6e9cc2
TC
3649/*
3650 * If sc->nr_to_scan is zero, the caller is requesting a query of the
3651 * number of objects which can potentially be freed. If it is nonzero,
3652 * the request is to free that many objects.
3653 *
3654 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
3655 * in struct shrinker and also require the shrinker to return the number
3656 * of objects freed.
3657 *
3658 * Older kernels require the shrinker to return the number of freeable
3659 * objects following the freeing of nr_to_free.
3660 */
3661static spl_shrinker_t
7e7baeca 3662__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
7cb67b45 3663{
ed6e9cc2 3664 int64_t pages;
7cb67b45 3665
302f753f
BB
3666 /* The arc is considered warm once reclaim has occurred */
3667 if (unlikely(arc_warm == B_FALSE))
3668 arc_warm = B_TRUE;
7cb67b45 3669
302f753f 3670 /* Return the potential number of reclaimable pages */
ed6e9cc2 3671 pages = btop((int64_t)arc_evictable_memory());
302f753f
BB
3672 if (sc->nr_to_scan == 0)
3673 return (pages);
3fd70ee6
BB
3674
3675 /* Not allowed to perform filesystem reclaim */
7e7baeca 3676 if (!(sc->gfp_mask & __GFP_FS))
ed6e9cc2 3677 return (SHRINK_STOP);
3fd70ee6 3678
7cb67b45 3679 /* Reclaim in progress */
ca0bf58d 3680 if (mutex_tryenter(&arc_reclaim_lock) == 0)
ed6e9cc2 3681 return (SHRINK_STOP);
7cb67b45 3682
ca0bf58d
PS
3683 mutex_exit(&arc_reclaim_lock);
3684
302f753f
BB
3685 /*
3686 * Evict the requested number of pages by shrinking arc_c the
3687 * requested amount. If there is nothing left to evict just
3688 * reap whatever we can from the various arc slabs.
3689 */
3690 if (pages > 0) {
ca67b33a
MA
3691 arc_shrink(ptob(sc->nr_to_scan));
3692 arc_kmem_reap_now();
ed6e9cc2
TC
3693#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
3694 pages = MAX(pages - btop(arc_evictable_memory()), 0);
3695#else
1e3cb67b 3696 pages = btop(arc_evictable_memory());
ed6e9cc2 3697#endif
302f753f 3698 } else {
ca67b33a 3699 arc_kmem_reap_now();
ed6e9cc2 3700 pages = SHRINK_STOP;
302f753f
BB
3701 }
3702
ca0bf58d
PS
3703 /*
3704 * We've reaped what we can, wake up threads.
3705 */
3706 cv_broadcast(&arc_reclaim_waiters_cv);
3707
302f753f
BB
3708 /*
3709 * When direct reclaim is observed it usually indicates a rapid
3710 * increase in memory pressure. This occurs because the kswapd
3711 * threads were unable to asynchronously keep enough free memory
3712 * available. In this case set arc_no_grow to briefly pause arc
3713 * growth to avoid compounding the memory pressure.
3714 */
7cb67b45 3715 if (current_is_kswapd()) {
302f753f 3716 ARCSTAT_BUMP(arcstat_memory_indirect_count);
7cb67b45 3717 } else {
302f753f 3718 arc_no_grow = B_TRUE;
bce45ec9 3719 arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
302f753f 3720 ARCSTAT_BUMP(arcstat_memory_direct_count);
7cb67b45
BB
3721 }
3722
1e3cb67b 3723 return (pages);
7cb67b45 3724}
7e7baeca 3725SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
7cb67b45
BB
3726
3727SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
3728#endif /* _KERNEL */
3729
34dc7c2f
BB
3730/*
3731 * Adapt arc info given the number of bytes we are trying to add and
3732 * the state that we are comming from. This function is only called
3733 * when we are adding new content to the cache.
3734 */
3735static void
3736arc_adapt(int bytes, arc_state_t *state)
3737{
3738 int mult;
728d6ae9 3739 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
36da08ef
PS
3740 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
3741 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
34dc7c2f
BB
3742
3743 if (state == arc_l2c_only)
3744 return;
3745
3746 ASSERT(bytes > 0);
3747 /*
3748 * Adapt the target size of the MRU list:
3749 * - if we just hit in the MRU ghost list, then increase
3750 * the target size of the MRU list.
3751 * - if we just hit in the MFU ghost list, then increase
3752 * the target size of the MFU list by decreasing the
3753 * target size of the MRU list.
3754 */
3755 if (state == arc_mru_ghost) {
36da08ef 3756 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
62422785
PS
3757 if (!zfs_arc_p_dampener_disable)
3758 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
34dc7c2f 3759
728d6ae9 3760 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
34dc7c2f 3761 } else if (state == arc_mfu_ghost) {
d164b209
BB
3762 uint64_t delta;
3763
36da08ef 3764 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
62422785
PS
3765 if (!zfs_arc_p_dampener_disable)
3766 mult = MIN(mult, 10);
34dc7c2f 3767
d164b209 3768 delta = MIN(bytes * mult, arc_p);
728d6ae9 3769 arc_p = MAX(arc_p_min, arc_p - delta);
34dc7c2f
BB
3770 }
3771 ASSERT((int64_t)arc_p >= 0);
3772
ca67b33a
MA
3773 if (arc_reclaim_needed()) {
3774 cv_signal(&arc_reclaim_thread_cv);
3775 return;
3776 }
3777
34dc7c2f
BB
3778 if (arc_no_grow)
3779 return;
3780
3781 if (arc_c >= arc_c_max)
3782 return;
3783
3784 /*
3785 * If we're within (2 * maxblocksize) bytes of the target
3786 * cache size, increment the target cache size
3787 */
121b3cae
TC
3788 VERIFY3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
3789 if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
34dc7c2f
BB
3790 atomic_add_64(&arc_c, (int64_t)bytes);
3791 if (arc_c > arc_c_max)
3792 arc_c = arc_c_max;
3793 else if (state == arc_anon)
3794 atomic_add_64(&arc_p, (int64_t)bytes);
3795 if (arc_p > arc_c)
3796 arc_p = arc_c;
3797 }
3798 ASSERT((int64_t)arc_p >= 0);
3799}
3800
3801/*
ca0bf58d
PS
3802 * Check if arc_size has grown past our upper threshold, determined by
3803 * zfs_arc_overflow_shift.
34dc7c2f 3804 */
ca0bf58d
PS
3805static boolean_t
3806arc_is_overflowing(void)
34dc7c2f 3807{
ca0bf58d
PS
3808 /* Always allow at least one block of overflow */
3809 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
3810 arc_c >> zfs_arc_overflow_shift);
34dc7c2f 3811
ca0bf58d 3812 return (arc_size >= arc_c + overflow);
34dc7c2f
BB
3813}
3814
3815/*
ca0bf58d
PS
3816 * The buffer, supplied as the first argument, needs a data block. If we
3817 * are hitting the hard limit for the cache size, we must sleep, waiting
3818 * for the eviction thread to catch up. If we're past the target size
3819 * but below the hard limit, we'll only signal the reclaim thread and
3820 * continue on.
34dc7c2f
BB
3821 */
3822static void
3823arc_get_data_buf(arc_buf_t *buf)
3824{
b9541d6b 3825 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
34dc7c2f 3826 uint64_t size = buf->b_hdr->b_size;
b9541d6b 3827 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
34dc7c2f
BB
3828
3829 arc_adapt(size, state);
3830
3831 /*
ca0bf58d
PS
3832 * If arc_size is currently overflowing, and has grown past our
3833 * upper limit, we must be adding data faster than the evict
3834 * thread can evict. Thus, to ensure we don't compound the
3835 * problem by adding more data and forcing arc_size to grow even
3836 * further past it's target size, we halt and wait for the
3837 * eviction thread to catch up.
3838 *
3839 * It's also possible that the reclaim thread is unable to evict
3840 * enough buffers to get arc_size below the overflow limit (e.g.
3841 * due to buffers being un-evictable, or hash lock collisions).
3842 * In this case, we want to proceed regardless if we're
3843 * overflowing; thus we don't use a while loop here.
34dc7c2f 3844 */
ca0bf58d
PS
3845 if (arc_is_overflowing()) {
3846 mutex_enter(&arc_reclaim_lock);
3847
3848 /*
3849 * Now that we've acquired the lock, we may no longer be
3850 * over the overflow limit, lets check.
3851 *
3852 * We're ignoring the case of spurious wake ups. If that
3853 * were to happen, it'd let this thread consume an ARC
3854 * buffer before it should have (i.e. before we're under
3855 * the overflow limit and were signalled by the reclaim
3856 * thread). As long as that is a rare occurrence, it
3857 * shouldn't cause any harm.
3858 */
3859 if (arc_is_overflowing()) {
3860 cv_signal(&arc_reclaim_thread_cv);
3861 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
34dc7c2f 3862 }
34dc7c2f 3863
ca0bf58d 3864 mutex_exit(&arc_reclaim_lock);
34dc7c2f 3865 }
ab26409d 3866
da8ccd0e 3867 if (type == ARC_BUFC_METADATA) {
ca0bf58d
PS
3868 buf->b_data = zio_buf_alloc(size);
3869 arc_space_consume(size, ARC_SPACE_META);
3870 } else {
3871 ASSERT(type == ARC_BUFC_DATA);
3872 buf->b_data = zio_data_buf_alloc(size);
3873 arc_space_consume(size, ARC_SPACE_DATA);
da8ccd0e
PS
3874 }
3875
34dc7c2f
BB
3876 /*
3877 * Update the state size. Note that ghost states have a
3878 * "ghost size" and so don't need to be updated.
3879 */
b9541d6b 3880 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
34dc7c2f 3881 arc_buf_hdr_t *hdr = buf->b_hdr;
36da08ef 3882 arc_state_t *state = hdr->b_l1hdr.b_state;
34dc7c2f 3883
36da08ef 3884 (void) refcount_add_many(&state->arcs_size, size, buf);
ca0bf58d
PS
3885
3886 /*
3887 * If this is reached via arc_read, the link is
3888 * protected by the hash lock. If reached via
3889 * arc_buf_alloc, the header should not be accessed by
3890 * any other thread. And, if reached via arc_read_done,
3891 * the hash lock will protect it if it's found in the
3892 * hash table; otherwise no other thread should be
3893 * trying to [add|remove]_reference it.
3894 */
3895 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
b9541d6b
CW
3896 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3897 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3898 size);
34dc7c2f
BB
3899 }
3900 /*
3901 * If we are growing the cache, and we are adding anonymous
3902 * data, and we have outgrown arc_p, update arc_p
3903 */
ca0bf58d 3904 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
36da08ef
PS
3905 (refcount_count(&arc_anon->arcs_size) +
3906 refcount_count(&arc_mru->arcs_size) > arc_p))
34dc7c2f
BB
3907 arc_p = MIN(arc_c, arc_p + size);
3908 }
3909}
3910
3911/*
3912 * This routine is called whenever a buffer is accessed.
3913 * NOTE: the hash lock is dropped in this function.
3914 */
3915static void
2a432414 3916arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
34dc7c2f 3917{
428870ff
BB
3918 clock_t now;
3919
34dc7c2f 3920 ASSERT(MUTEX_HELD(hash_lock));
b9541d6b 3921 ASSERT(HDR_HAS_L1HDR(hdr));
34dc7c2f 3922
b9541d6b 3923 if (hdr->b_l1hdr.b_state == arc_anon) {
34dc7c2f
BB
3924 /*
3925 * This buffer is not in the cache, and does not
3926 * appear in our "ghost" list. Add the new buffer
3927 * to the MRU state.
3928 */
3929
b9541d6b
CW
3930 ASSERT0(hdr->b_l1hdr.b_arc_access);
3931 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
3932 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3933 arc_change_state(arc_mru, hdr, hash_lock);
34dc7c2f 3934
b9541d6b 3935 } else if (hdr->b_l1hdr.b_state == arc_mru) {
428870ff
BB
3936 now = ddi_get_lbolt();
3937
34dc7c2f
BB
3938 /*
3939 * If this buffer is here because of a prefetch, then either:
3940 * - clear the flag if this is a "referencing" read
3941 * (any subsequent access will bump this into the MFU state).
3942 * or
3943 * - move the buffer to the head of the list if this is
3944 * another prefetch (to make it less likely to be evicted).
3945 */
b9541d6b
CW
3946 if (HDR_PREFETCH(hdr)) {
3947 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
ca0bf58d
PS
3948 /* link protected by hash lock */
3949 ASSERT(multilist_link_active(
b9541d6b 3950 &hdr->b_l1hdr.b_arc_node));
34dc7c2f 3951 } else {
2a432414 3952 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
b9541d6b 3953 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
34dc7c2f
BB
3954 ARCSTAT_BUMP(arcstat_mru_hits);
3955 }
b9541d6b 3956 hdr->b_l1hdr.b_arc_access = now;
34dc7c2f
BB
3957 return;
3958 }
3959
3960 /*
3961 * This buffer has been "accessed" only once so far,
3962 * but it is still in the cache. Move it to the MFU
3963 * state.
3964 */
b9541d6b
CW
3965 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
3966 ARC_MINTIME)) {
34dc7c2f
BB
3967 /*
3968 * More than 125ms have passed since we
3969 * instantiated this buffer. Move it to the
3970 * most frequently used state.
3971 */
b9541d6b 3972 hdr->b_l1hdr.b_arc_access = now;
2a432414
GW
3973 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3974 arc_change_state(arc_mfu, hdr, hash_lock);
34dc7c2f 3975 }
b9541d6b 3976 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
34dc7c2f 3977 ARCSTAT_BUMP(arcstat_mru_hits);
b9541d6b 3978 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
34dc7c2f
BB
3979 arc_state_t *new_state;
3980 /*
3981 * This buffer has been "accessed" recently, but
3982 * was evicted from the cache. Move it to the
3983 * MFU state.
3984 */
3985
b9541d6b 3986 if (HDR_PREFETCH(hdr)) {
34dc7c2f 3987 new_state = arc_mru;
b9541d6b 3988 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
2a432414
GW
3989 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3990 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
34dc7c2f
BB
3991 } else {
3992 new_state = arc_mfu;
2a432414 3993 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
34dc7c2f
BB
3994 }
3995
b9541d6b 3996 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414 3997 arc_change_state(new_state, hdr, hash_lock);
34dc7c2f 3998
b9541d6b 3999 atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
34dc7c2f 4000 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
b9541d6b 4001 } else if (hdr->b_l1hdr.b_state == arc_mfu) {
34dc7c2f
BB
4002 /*
4003 * This buffer has been accessed more than once and is
4004 * still in the cache. Keep it in the MFU state.
4005 *
4006 * NOTE: an add_reference() that occurred when we did
4007 * the arc_read() will have kicked this off the list.
4008 * If it was a prefetch, we will explicitly move it to
4009 * the head of the list now.
4010 */
b9541d6b
CW
4011 if ((HDR_PREFETCH(hdr)) != 0) {
4012 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ca0bf58d
PS
4013 /* link protected by hash_lock */
4014 ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
34dc7c2f 4015 }
b9541d6b 4016 atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
34dc7c2f 4017 ARCSTAT_BUMP(arcstat_mfu_hits);
b9541d6b
CW
4018 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4019 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
34dc7c2f
BB
4020 arc_state_t *new_state = arc_mfu;
4021 /*
4022 * This buffer has been accessed more than once but has
4023 * been evicted from the cache. Move it back to the
4024 * MFU state.
4025 */
4026
b9541d6b 4027 if (HDR_PREFETCH(hdr)) {
34dc7c2f
BB
4028 /*
4029 * This is a prefetch access...
4030 * move this block back to the MRU state.
4031 */
b9541d6b 4032 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
34dc7c2f
BB
4033 new_state = arc_mru;
4034 }
4035
b9541d6b 4036 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
4037 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4038 arc_change_state(new_state, hdr, hash_lock);
34dc7c2f 4039
b9541d6b 4040 atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
34dc7c2f 4041 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
b9541d6b 4042 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
34dc7c2f
BB
4043 /*
4044 * This buffer is on the 2nd Level ARC.
4045 */
4046
b9541d6b 4047 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
4048 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4049 arc_change_state(arc_mfu, hdr, hash_lock);
34dc7c2f 4050 } else {
b9541d6b
CW
4051 cmn_err(CE_PANIC, "invalid arc state 0x%p",
4052 hdr->b_l1hdr.b_state);
34dc7c2f
BB
4053 }
4054}
4055
4056/* a generic arc_done_func_t which you can use */
4057/* ARGSUSED */
4058void
4059arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
4060{
428870ff
BB
4061 if (zio == NULL || zio->io_error == 0)
4062 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
13fe0198 4063 VERIFY(arc_buf_remove_ref(buf, arg));
34dc7c2f
BB
4064}
4065
4066/* a generic arc_done_func_t */
4067void
4068arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
4069{
4070 arc_buf_t **bufp = arg;
4071 if (zio && zio->io_error) {
13fe0198 4072 VERIFY(arc_buf_remove_ref(buf, arg));
34dc7c2f
BB
4073 *bufp = NULL;
4074 } else {
4075 *bufp = buf;
428870ff 4076 ASSERT(buf->b_data);
34dc7c2f
BB
4077 }
4078}
4079
4080static void
4081arc_read_done(zio_t *zio)
4082{
9b67f605 4083 arc_buf_hdr_t *hdr;
34dc7c2f
BB
4084 arc_buf_t *buf;
4085 arc_buf_t *abuf; /* buffer we're assigning to callback */
9b67f605 4086 kmutex_t *hash_lock = NULL;
34dc7c2f
BB
4087 arc_callback_t *callback_list, *acb;
4088 int freeable = FALSE;
4089
4090 buf = zio->io_private;
4091 hdr = buf->b_hdr;
4092
4093 /*
4094 * The hdr was inserted into hash-table and removed from lists
4095 * prior to starting I/O. We should find this header, since
4096 * it's in the hash table, and it should be legit since it's
4097 * not possible to evict it during the I/O. The only possible
4098 * reason for it not to be found is if we were freed during the
4099 * read.
4100 */
9b67f605
MA
4101 if (HDR_IN_HASH_TABLE(hdr)) {
4102 arc_buf_hdr_t *found;
4103
4104 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
4105 ASSERT3U(hdr->b_dva.dva_word[0], ==,
4106 BP_IDENTITY(zio->io_bp)->dva_word[0]);
4107 ASSERT3U(hdr->b_dva.dva_word[1], ==,
4108 BP_IDENTITY(zio->io_bp)->dva_word[1]);
4109
4110 found = buf_hash_find(hdr->b_spa, zio->io_bp,
4111 &hash_lock);
4112
4113 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
4114 hash_lock == NULL) ||
4115 (found == hdr &&
4116 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
4117 (found == hdr && HDR_L2_READING(hdr)));
4118 }
34dc7c2f 4119
2a432414 4120 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
b9541d6b 4121 if (l2arc_noprefetch && HDR_PREFETCH(hdr))
2a432414 4122 hdr->b_flags &= ~ARC_FLAG_L2CACHE;
34dc7c2f
BB
4123
4124 /* byteswap if necessary */
b9541d6b 4125 callback_list = hdr->b_l1hdr.b_acb;
34dc7c2f 4126 ASSERT(callback_list != NULL);
428870ff 4127 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
9ae529ec
CS
4128 dmu_object_byteswap_t bswap =
4129 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
b01615d5
RY
4130 if (BP_GET_LEVEL(zio->io_bp) > 0)
4131 byteswap_uint64_array(buf->b_data, hdr->b_size);
4132 else
4133 dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
b128c09f 4134 }
34dc7c2f
BB
4135
4136 arc_cksum_compute(buf, B_FALSE);
498877ba 4137 arc_buf_watch(buf);
34dc7c2f 4138
b9541d6b
CW
4139 if (hash_lock && zio->io_error == 0 &&
4140 hdr->b_l1hdr.b_state == arc_anon) {
428870ff
BB
4141 /*
4142 * Only call arc_access on anonymous buffers. This is because
4143 * if we've issued an I/O for an evicted buffer, we've already
4144 * called arc_access (to prevent any simultaneous readers from
4145 * getting confused).
4146 */
4147 arc_access(hdr, hash_lock);
4148 }
4149
34dc7c2f
BB
4150 /* create copies of the data buffer for the callers */
4151 abuf = buf;
4152 for (acb = callback_list; acb; acb = acb->acb_next) {
4153 if (acb->acb_done) {
1eb5bfa3
GW
4154 if (abuf == NULL) {
4155 ARCSTAT_BUMP(arcstat_duplicate_reads);
34dc7c2f 4156 abuf = arc_buf_clone(buf);
1eb5bfa3 4157 }
34dc7c2f
BB
4158 acb->acb_buf = abuf;
4159 abuf = NULL;
4160 }
4161 }
b9541d6b 4162 hdr->b_l1hdr.b_acb = NULL;
2a432414 4163 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
34dc7c2f 4164 ASSERT(!HDR_BUF_AVAILABLE(hdr));
428870ff
BB
4165 if (abuf == buf) {
4166 ASSERT(buf->b_efunc == NULL);
b9541d6b 4167 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2a432414 4168 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
428870ff 4169 }
34dc7c2f 4170
b9541d6b
CW
4171 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
4172 callback_list != NULL);
34dc7c2f
BB
4173
4174 if (zio->io_error != 0) {
2a432414 4175 hdr->b_flags |= ARC_FLAG_IO_ERROR;
b9541d6b 4176 if (hdr->b_l1hdr.b_state != arc_anon)
34dc7c2f
BB
4177 arc_change_state(arc_anon, hdr, hash_lock);
4178 if (HDR_IN_HASH_TABLE(hdr))
4179 buf_hash_remove(hdr);
b9541d6b 4180 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
34dc7c2f
BB
4181 }
4182
4183 /*
4184 * Broadcast before we drop the hash_lock to avoid the possibility
4185 * that the hdr (and hence the cv) might be freed before we get to
4186 * the cv_broadcast().
4187 */
b9541d6b 4188 cv_broadcast(&hdr->b_l1hdr.b_cv);
34dc7c2f 4189
b9541d6b 4190 if (hash_lock != NULL) {
34dc7c2f
BB
4191 mutex_exit(hash_lock);
4192 } else {
4193 /*
4194 * This block was freed while we waited for the read to
4195 * complete. It has been removed from the hash table and
4196 * moved to the anonymous state (so that it won't show up
4197 * in the cache).
4198 */
b9541d6b
CW
4199 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
4200 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
34dc7c2f
BB
4201 }
4202
4203 /* execute each callback and free its structure */
4204 while ((acb = callback_list) != NULL) {
4205 if (acb->acb_done)
4206 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
4207
4208 if (acb->acb_zio_dummy != NULL) {
4209 acb->acb_zio_dummy->io_error = zio->io_error;
4210 zio_nowait(acb->acb_zio_dummy);
4211 }
4212
4213 callback_list = acb->acb_next;
4214 kmem_free(acb, sizeof (arc_callback_t));
4215 }
4216
4217 if (freeable)
4218 arc_hdr_destroy(hdr);
4219}
4220
4221/*
5c839890 4222 * "Read" the block at the specified DVA (in bp) via the
34dc7c2f
BB
4223 * cache. If the block is found in the cache, invoke the provided
4224 * callback immediately and return. Note that the `zio' parameter
4225 * in the callback will be NULL in this case, since no IO was
4226 * required. If the block is not in the cache pass the read request
4227 * on to the spa with a substitute callback function, so that the
4228 * requested block will be added to the cache.
4229 *
4230 * If a read request arrives for a block that has a read in-progress,
4231 * either wait for the in-progress read to complete (and return the
4232 * results); or, if this is a read with a "done" func, add a record
4233 * to the read to invoke the "done" func when the read completes,
4234 * and return; or just return.
4235 *
4236 * arc_read_done() will invoke all the requested "done" functions
4237 * for readers of this block.
4238 */
4239int
294f6806 4240arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2a432414
GW
4241 void *private, zio_priority_t priority, int zio_flags,
4242 arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
34dc7c2f 4243{
9b67f605 4244 arc_buf_hdr_t *hdr = NULL;
d4ed6673 4245 arc_buf_t *buf = NULL;
9b67f605 4246 kmutex_t *hash_lock = NULL;
34dc7c2f 4247 zio_t *rzio;
3541dc6d 4248 uint64_t guid = spa_load_guid(spa);
1421c891 4249 int rc = 0;
34dc7c2f 4250
9b67f605
MA
4251 ASSERT(!BP_IS_EMBEDDED(bp) ||
4252 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
4253
34dc7c2f 4254top:
9b67f605
MA
4255 if (!BP_IS_EMBEDDED(bp)) {
4256 /*
4257 * Embedded BP's have no DVA and require no I/O to "read".
4258 * Create an anonymous arc buf to back it.
4259 */
4260 hdr = buf_hash_find(guid, bp, &hash_lock);
4261 }
4262
b9541d6b 4263 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
34dc7c2f 4264
2a432414 4265 *arc_flags |= ARC_FLAG_CACHED;
34dc7c2f
BB
4266
4267 if (HDR_IO_IN_PROGRESS(hdr)) {
4268
2a432414 4269 if (*arc_flags & ARC_FLAG_WAIT) {
b9541d6b 4270 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
34dc7c2f
BB
4271 mutex_exit(hash_lock);
4272 goto top;
4273 }
2a432414 4274 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
34dc7c2f
BB
4275
4276 if (done) {
4277 arc_callback_t *acb = NULL;
4278
4279 acb = kmem_zalloc(sizeof (arc_callback_t),
79c76d5b 4280 KM_SLEEP);
34dc7c2f
BB
4281 acb->acb_done = done;
4282 acb->acb_private = private;
34dc7c2f
BB
4283 if (pio != NULL)
4284 acb->acb_zio_dummy = zio_null(pio,
d164b209 4285 spa, NULL, NULL, NULL, zio_flags);
34dc7c2f
BB
4286
4287 ASSERT(acb->acb_done != NULL);
b9541d6b
CW
4288 acb->acb_next = hdr->b_l1hdr.b_acb;
4289 hdr->b_l1hdr.b_acb = acb;
34dc7c2f
BB
4290 add_reference(hdr, hash_lock, private);
4291 mutex_exit(hash_lock);
1421c891 4292 goto out;
34dc7c2f
BB
4293 }
4294 mutex_exit(hash_lock);
1421c891 4295 goto out;
34dc7c2f
BB
4296 }
4297
b9541d6b
CW
4298 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4299 hdr->b_l1hdr.b_state == arc_mfu);
34dc7c2f
BB
4300
4301 if (done) {
4302 add_reference(hdr, hash_lock, private);
4303 /*
4304 * If this block is already in use, create a new
4305 * copy of the data so that we will be guaranteed
4306 * that arc_release() will always succeed.
4307 */
b9541d6b 4308 buf = hdr->b_l1hdr.b_buf;
34dc7c2f
BB
4309 ASSERT(buf);
4310 ASSERT(buf->b_data);
4311 if (HDR_BUF_AVAILABLE(hdr)) {
4312 ASSERT(buf->b_efunc == NULL);
2a432414 4313 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
34dc7c2f
BB
4314 } else {
4315 buf = arc_buf_clone(buf);
4316 }
428870ff 4317
2a432414 4318 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
b9541d6b 4319 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
2a432414 4320 hdr->b_flags |= ARC_FLAG_PREFETCH;
34dc7c2f
BB
4321 }
4322 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4323 arc_access(hdr, hash_lock);
2a432414
GW
4324 if (*arc_flags & ARC_FLAG_L2CACHE)
4325 hdr->b_flags |= ARC_FLAG_L2CACHE;
4326 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4327 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
34dc7c2f
BB
4328 mutex_exit(hash_lock);
4329 ARCSTAT_BUMP(arcstat_hits);
b9541d6b
CW
4330 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4331 demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
34dc7c2f
BB
4332 data, metadata, hits);
4333
4334 if (done)
4335 done(NULL, buf, private);
4336 } else {
4337 uint64_t size = BP_GET_LSIZE(bp);
9b67f605 4338 arc_callback_t *acb;
b128c09f 4339 vdev_t *vd = NULL;
a117a6d6 4340 uint64_t addr = 0;
d164b209 4341 boolean_t devw = B_FALSE;
0ed212dc 4342 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
b9541d6b 4343 int32_t b_asize = 0;
34dc7c2f 4344
5f6d0b6f
BB
4345 /*
4346 * Gracefully handle a damaged logical block size as a
4347 * checksum error by passing a dummy zio to the done callback.
4348 */
f1512ee6 4349 if (size > spa_maxblocksize(spa)) {
5f6d0b6f
BB
4350 if (done) {
4351 rzio = zio_null(pio, spa, NULL,
4352 NULL, NULL, zio_flags);
4353 rzio->io_error = ECKSUM;
4354 done(rzio, buf, private);
4355 zio_nowait(rzio);
4356 }
4357 rc = ECKSUM;
4358 goto out;
4359 }
4360
34dc7c2f
BB
4361 if (hdr == NULL) {
4362 /* this block is not in the cache */
9b67f605 4363 arc_buf_hdr_t *exists = NULL;
34dc7c2f
BB
4364 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4365 buf = arc_buf_alloc(spa, size, private, type);
4366 hdr = buf->b_hdr;
9b67f605
MA
4367 if (!BP_IS_EMBEDDED(bp)) {
4368 hdr->b_dva = *BP_IDENTITY(bp);
4369 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
9b67f605
MA
4370 exists = buf_hash_insert(hdr, &hash_lock);
4371 }
4372 if (exists != NULL) {
34dc7c2f
BB
4373 /* somebody beat us to the hash insert */
4374 mutex_exit(hash_lock);
428870ff 4375 buf_discard_identity(hdr);
34dc7c2f
BB
4376 (void) arc_buf_remove_ref(buf, private);
4377 goto top; /* restart the IO request */
4378 }
2a432414 4379
34dc7c2f 4380 /* if this is a prefetch, we don't have a reference */
2a432414 4381 if (*arc_flags & ARC_FLAG_PREFETCH) {
34dc7c2f
BB
4382 (void) remove_reference(hdr, hash_lock,
4383 private);
2a432414 4384 hdr->b_flags |= ARC_FLAG_PREFETCH;
34dc7c2f 4385 }
2a432414
GW
4386 if (*arc_flags & ARC_FLAG_L2CACHE)
4387 hdr->b_flags |= ARC_FLAG_L2CACHE;
4388 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4389 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
34dc7c2f 4390 if (BP_GET_LEVEL(bp) > 0)
2a432414 4391 hdr->b_flags |= ARC_FLAG_INDIRECT;
34dc7c2f 4392 } else {
b9541d6b
CW
4393 /*
4394 * This block is in the ghost cache. If it was L2-only
4395 * (and thus didn't have an L1 hdr), we realloc the
4396 * header to add an L1 hdr.
4397 */
4398 if (!HDR_HAS_L1HDR(hdr)) {
4399 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4400 hdr_full_cache);
4401 }
4402
4403 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
34dc7c2f 4404 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
b9541d6b 4405 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ca0bf58d 4406 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
34dc7c2f
BB
4407
4408 /* if this is a prefetch, we don't have a reference */
2a432414
GW
4409 if (*arc_flags & ARC_FLAG_PREFETCH)
4410 hdr->b_flags |= ARC_FLAG_PREFETCH;
34dc7c2f
BB
4411 else
4412 add_reference(hdr, hash_lock, private);
2a432414
GW
4413 if (*arc_flags & ARC_FLAG_L2CACHE)
4414 hdr->b_flags |= ARC_FLAG_L2CACHE;
4415 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4416 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
34dc7c2f
BB
4417 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4418 buf->b_hdr = hdr;
4419 buf->b_data = NULL;
4420 buf->b_efunc = NULL;
4421 buf->b_private = NULL;
4422 buf->b_next = NULL;
b9541d6b
CW
4423 hdr->b_l1hdr.b_buf = buf;
4424 ASSERT0(hdr->b_l1hdr.b_datacnt);
4425 hdr->b_l1hdr.b_datacnt = 1;
428870ff
BB
4426 arc_get_data_buf(buf);
4427 arc_access(hdr, hash_lock);
34dc7c2f
BB
4428 }
4429
b9541d6b 4430 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
428870ff 4431
79c76d5b 4432 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
34dc7c2f
BB
4433 acb->acb_done = done;
4434 acb->acb_private = private;
34dc7c2f 4435
b9541d6b
CW
4436 ASSERT(hdr->b_l1hdr.b_acb == NULL);
4437 hdr->b_l1hdr.b_acb = acb;
2a432414 4438 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
34dc7c2f 4439
b9541d6b
CW
4440 if (HDR_HAS_L2HDR(hdr) &&
4441 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4442 devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4443 addr = hdr->b_l2hdr.b_daddr;
4444 b_compress = HDR_GET_COMPRESS(hdr);
4445 b_asize = hdr->b_l2hdr.b_asize;
b128c09f
BB
4446 /*
4447 * Lock out device removal.
4448 */
4449 if (vdev_is_dead(vd) ||
4450 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4451 vd = NULL;
4452 }
4453
9b67f605
MA
4454 if (hash_lock != NULL)
4455 mutex_exit(hash_lock);
b128c09f 4456
e49f1e20
WA
4457 /*
4458 * At this point, we have a level 1 cache miss. Try again in
4459 * L2ARC if possible.
4460 */
34dc7c2f 4461 ASSERT3U(hdr->b_size, ==, size);
428870ff 4462 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
5dbd68a3 4463 uint64_t, size, zbookmark_phys_t *, zb);
34dc7c2f 4464 ARCSTAT_BUMP(arcstat_misses);
b9541d6b
CW
4465 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4466 demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
34dc7c2f
BB
4467 data, metadata, misses);
4468
d164b209 4469 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
34dc7c2f
BB
4470 /*
4471 * Read from the L2ARC if the following are true:
b128c09f
BB
4472 * 1. The L2ARC vdev was previously cached.
4473 * 2. This buffer still has L2ARC metadata.
4474 * 3. This buffer isn't currently writing to the L2ARC.
4475 * 4. The L2ARC entry wasn't evicted, which may
4476 * also have invalidated the vdev.
d164b209 4477 * 5. This isn't prefetch and l2arc_noprefetch is set.
34dc7c2f 4478 */
b9541d6b 4479 if (HDR_HAS_L2HDR(hdr) &&
d164b209
BB
4480 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4481 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
34dc7c2f
BB
4482 l2arc_read_callback_t *cb;
4483
4484 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4485 ARCSTAT_BUMP(arcstat_l2_hits);
b9541d6b 4486 atomic_inc_32(&hdr->b_l2hdr.b_hits);
34dc7c2f 4487
34dc7c2f 4488 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
79c76d5b 4489 KM_SLEEP);
34dc7c2f
BB
4490 cb->l2rcb_buf = buf;
4491 cb->l2rcb_spa = spa;
4492 cb->l2rcb_bp = *bp;
4493 cb->l2rcb_zb = *zb;
b128c09f 4494 cb->l2rcb_flags = zio_flags;
0ed212dc 4495 cb->l2rcb_compress = b_compress;
34dc7c2f 4496
a117a6d6
GW
4497 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4498 addr + size < vd->vdev_psize -
4499 VDEV_LABEL_END_SIZE);
4500
34dc7c2f 4501 /*
b128c09f
BB
4502 * l2arc read. The SCL_L2ARC lock will be
4503 * released by l2arc_read_done().
3a17a7a9
SK
4504 * Issue a null zio if the underlying buffer
4505 * was squashed to zero size by compression.
34dc7c2f 4506 */
0ed212dc 4507 if (b_compress == ZIO_COMPRESS_EMPTY) {
3a17a7a9
SK
4508 rzio = zio_null(pio, spa, vd,
4509 l2arc_read_done, cb,
4510 zio_flags | ZIO_FLAG_DONT_CACHE |
4511 ZIO_FLAG_CANFAIL |
4512 ZIO_FLAG_DONT_PROPAGATE |
4513 ZIO_FLAG_DONT_RETRY);
4514 } else {
4515 rzio = zio_read_phys(pio, vd, addr,
0ed212dc
BP
4516 b_asize, buf->b_data,
4517 ZIO_CHECKSUM_OFF,
3a17a7a9
SK
4518 l2arc_read_done, cb, priority,
4519 zio_flags | ZIO_FLAG_DONT_CACHE |
4520 ZIO_FLAG_CANFAIL |
4521 ZIO_FLAG_DONT_PROPAGATE |
4522 ZIO_FLAG_DONT_RETRY, B_FALSE);
4523 }
34dc7c2f
BB
4524 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4525 zio_t *, rzio);
0ed212dc 4526 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
34dc7c2f 4527
2a432414 4528 if (*arc_flags & ARC_FLAG_NOWAIT) {
b128c09f 4529 zio_nowait(rzio);
1421c891 4530 goto out;
b128c09f 4531 }
34dc7c2f 4532
2a432414 4533 ASSERT(*arc_flags & ARC_FLAG_WAIT);
b128c09f 4534 if (zio_wait(rzio) == 0)
1421c891 4535 goto out;
b128c09f
BB
4536
4537 /* l2arc read error; goto zio_read() */
34dc7c2f
BB
4538 } else {
4539 DTRACE_PROBE1(l2arc__miss,
4540 arc_buf_hdr_t *, hdr);
4541 ARCSTAT_BUMP(arcstat_l2_misses);
4542 if (HDR_L2_WRITING(hdr))
4543 ARCSTAT_BUMP(arcstat_l2_rw_clash);
b128c09f 4544 spa_config_exit(spa, SCL_L2ARC, vd);
34dc7c2f 4545 }
d164b209
BB
4546 } else {
4547 if (vd != NULL)
4548 spa_config_exit(spa, SCL_L2ARC, vd);
4549 if (l2arc_ndev != 0) {
4550 DTRACE_PROBE1(l2arc__miss,
4551 arc_buf_hdr_t *, hdr);
4552 ARCSTAT_BUMP(arcstat_l2_misses);
4553 }
34dc7c2f 4554 }
34dc7c2f
BB
4555
4556 rzio = zio_read(pio, spa, bp, buf->b_data, size,
b128c09f 4557 arc_read_done, buf, priority, zio_flags, zb);
34dc7c2f 4558
2a432414 4559 if (*arc_flags & ARC_FLAG_WAIT) {
1421c891
PS
4560 rc = zio_wait(rzio);
4561 goto out;
4562 }
34dc7c2f 4563
2a432414 4564 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
34dc7c2f
BB
4565 zio_nowait(rzio);
4566 }
1421c891
PS
4567
4568out:
4569 spa_read_history_add(spa, zb, *arc_flags);
4570 return (rc);
34dc7c2f
BB
4571}
4572
ab26409d
BB
4573arc_prune_t *
4574arc_add_prune_callback(arc_prune_func_t *func, void *private)
4575{
4576 arc_prune_t *p;
4577
d1d7e268 4578 p = kmem_alloc(sizeof (*p), KM_SLEEP);
ab26409d
BB
4579 p->p_pfunc = func;
4580 p->p_private = private;
4581 list_link_init(&p->p_node);
4582 refcount_create(&p->p_refcnt);
4583
4584 mutex_enter(&arc_prune_mtx);
4585 refcount_add(&p->p_refcnt, &arc_prune_list);
4586 list_insert_head(&arc_prune_list, p);
4587 mutex_exit(&arc_prune_mtx);
4588
4589 return (p);
4590}
4591
4592void
4593arc_remove_prune_callback(arc_prune_t *p)
4594{
4595 mutex_enter(&arc_prune_mtx);
4596 list_remove(&arc_prune_list, p);
4597 if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
4598 refcount_destroy(&p->p_refcnt);
4599 kmem_free(p, sizeof (*p));
4600 }
4601 mutex_exit(&arc_prune_mtx);
4602}
4603
34dc7c2f
BB
4604void
4605arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4606{
4607 ASSERT(buf->b_hdr != NULL);
b9541d6b
CW
4608 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4609 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4610 func == NULL);
428870ff
BB
4611 ASSERT(buf->b_efunc == NULL);
4612 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4613
34dc7c2f
BB
4614 buf->b_efunc = func;
4615 buf->b_private = private;
4616}
4617
df4474f9
MA
4618/*
4619 * Notify the arc that a block was freed, and thus will never be used again.
4620 */
4621void
4622arc_freed(spa_t *spa, const blkptr_t *bp)
4623{
4624 arc_buf_hdr_t *hdr;
4625 kmutex_t *hash_lock;
4626 uint64_t guid = spa_load_guid(spa);
4627
9b67f605
MA
4628 ASSERT(!BP_IS_EMBEDDED(bp));
4629
4630 hdr = buf_hash_find(guid, bp, &hash_lock);
df4474f9
MA
4631 if (hdr == NULL)
4632 return;
4633 if (HDR_BUF_AVAILABLE(hdr)) {
b9541d6b 4634 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
df4474f9 4635 add_reference(hdr, hash_lock, FTAG);
2a432414 4636 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
df4474f9
MA
4637 mutex_exit(hash_lock);
4638
4639 arc_release(buf, FTAG);
4640 (void) arc_buf_remove_ref(buf, FTAG);
4641 } else {
4642 mutex_exit(hash_lock);
4643 }
4644
4645}
4646
34dc7c2f 4647/*
bd089c54
MA
4648 * Clear the user eviction callback set by arc_set_callback(), first calling
4649 * it if it exists. Because the presence of a callback keeps an arc_buf cached
4650 * clearing the callback may result in the arc_buf being destroyed. However,
4651 * it will not result in the *last* arc_buf being destroyed, hence the data
4652 * will remain cached in the ARC. We make a copy of the arc buffer here so
4653 * that we can process the callback without holding any locks.
4654 *
4655 * It's possible that the callback is already in the process of being cleared
4656 * by another thread. In this case we can not clear the callback.
4657 *
4658 * Returns B_TRUE if the callback was successfully called and cleared.
34dc7c2f 4659 */
bd089c54
MA
4660boolean_t
4661arc_clear_callback(arc_buf_t *buf)
34dc7c2f
BB
4662{
4663 arc_buf_hdr_t *hdr;
4664 kmutex_t *hash_lock;
bd089c54
MA
4665 arc_evict_func_t *efunc = buf->b_efunc;
4666 void *private = buf->b_private;
34dc7c2f 4667
428870ff 4668 mutex_enter(&buf->b_evict_lock);
34dc7c2f
BB
4669 hdr = buf->b_hdr;
4670 if (hdr == NULL) {
4671 /*
4672 * We are in arc_do_user_evicts().
4673 */
4674 ASSERT(buf->b_data == NULL);
428870ff 4675 mutex_exit(&buf->b_evict_lock);
bd089c54 4676 return (B_FALSE);
b128c09f 4677 } else if (buf->b_data == NULL) {
34dc7c2f 4678 /*
b128c09f
BB
4679 * We are on the eviction list; process this buffer now
4680 * but let arc_do_user_evicts() do the reaping.
34dc7c2f 4681 */
b128c09f 4682 buf->b_efunc = NULL;
428870ff 4683 mutex_exit(&buf->b_evict_lock);
bd089c54
MA
4684 VERIFY0(efunc(private));
4685 return (B_TRUE);
34dc7c2f 4686 }
b128c09f
BB
4687 hash_lock = HDR_LOCK(hdr);
4688 mutex_enter(hash_lock);
428870ff
BB
4689 hdr = buf->b_hdr;
4690 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
34dc7c2f 4691
b9541d6b
CW
4692 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4693 hdr->b_l1hdr.b_datacnt);
4694 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4695 hdr->b_l1hdr.b_state == arc_mfu);
34dc7c2f 4696
bd089c54
MA
4697 buf->b_efunc = NULL;
4698 buf->b_private = NULL;
34dc7c2f 4699
b9541d6b 4700 if (hdr->b_l1hdr.b_datacnt > 1) {
bd089c54 4701 mutex_exit(&buf->b_evict_lock);
ca0bf58d 4702 arc_buf_destroy(buf, TRUE);
bd089c54 4703 } else {
b9541d6b 4704 ASSERT(buf == hdr->b_l1hdr.b_buf);
2a432414 4705 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
bd089c54 4706 mutex_exit(&buf->b_evict_lock);
34dc7c2f 4707 }
34dc7c2f 4708
bd089c54
MA
4709 mutex_exit(hash_lock);
4710 VERIFY0(efunc(private));
4711 return (B_TRUE);
34dc7c2f
BB
4712}
4713
4714/*
e49f1e20
WA
4715 * Release this buffer from the cache, making it an anonymous buffer. This
4716 * must be done after a read and prior to modifying the buffer contents.
34dc7c2f 4717 * If the buffer has more than one reference, we must make
b128c09f 4718 * a new hdr for the buffer.
34dc7c2f
BB
4719 */
4720void
4721arc_release(arc_buf_t *buf, void *tag)
4722{
b9541d6b
CW
4723 kmutex_t *hash_lock;
4724 arc_state_t *state;
4725 arc_buf_hdr_t *hdr = buf->b_hdr;
34dc7c2f 4726
428870ff 4727 /*
ca0bf58d 4728 * It would be nice to assert that if its DMU metadata (level >
428870ff
BB
4729 * 0 || it's the dnode file), then it must be syncing context.
4730 * But we don't know that information at this level.
4731 */
4732
4733 mutex_enter(&buf->b_evict_lock);
b128c09f 4734
ca0bf58d
PS
4735 ASSERT(HDR_HAS_L1HDR(hdr));
4736
b9541d6b
CW
4737 /*
4738 * We don't grab the hash lock prior to this check, because if
4739 * the buffer's header is in the arc_anon state, it won't be
4740 * linked into the hash table.
4741 */
4742 if (hdr->b_l1hdr.b_state == arc_anon) {
4743 mutex_exit(&buf->b_evict_lock);
4744 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4745 ASSERT(!HDR_IN_HASH_TABLE(hdr));
4746 ASSERT(!HDR_HAS_L2HDR(hdr));
4747 ASSERT(BUF_EMPTY(hdr));
34dc7c2f 4748
b9541d6b
CW
4749 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4750 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4751 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4752
4753 ASSERT3P(buf->b_efunc, ==, NULL);
4754 ASSERT3P(buf->b_private, ==, NULL);
4755
4756 hdr->b_l1hdr.b_arc_access = 0;
4757 arc_buf_thaw(buf);
4758
4759 return;
34dc7c2f
BB
4760 }
4761
b9541d6b
CW
4762 hash_lock = HDR_LOCK(hdr);
4763 mutex_enter(hash_lock);
4764
4765 /*
4766 * This assignment is only valid as long as the hash_lock is
4767 * held, we must be careful not to reference state or the
4768 * b_state field after dropping the lock.
4769 */
4770 state = hdr->b_l1hdr.b_state;
4771 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4772 ASSERT3P(state, !=, arc_anon);
4773
4774 /* this buffer is not on any list */
4775 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4776
4777 if (HDR_HAS_L2HDR(hdr)) {
b9541d6b 4778 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
ca0bf58d
PS
4779
4780 /*
d962d5da
PS
4781 * We have to recheck this conditional again now that
4782 * we're holding the l2ad_mtx to prevent a race with
4783 * another thread which might be concurrently calling
4784 * l2arc_evict(). In that case, l2arc_evict() might have
4785 * destroyed the header's L2 portion as we were waiting
4786 * to acquire the l2ad_mtx.
ca0bf58d 4787 */
d962d5da
PS
4788 if (HDR_HAS_L2HDR(hdr))
4789 arc_hdr_l2hdr_destroy(hdr);
ca0bf58d 4790
b9541d6b 4791 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
b128c09f
BB
4792 }
4793
34dc7c2f
BB
4794 /*
4795 * Do we have more than one buf?
4796 */
b9541d6b 4797 if (hdr->b_l1hdr.b_datacnt > 1) {
34dc7c2f
BB
4798 arc_buf_hdr_t *nhdr;
4799 arc_buf_t **bufp;
4800 uint64_t blksz = hdr->b_size;
d164b209 4801 uint64_t spa = hdr->b_spa;
b9541d6b 4802 arc_buf_contents_t type = arc_buf_type(hdr);
34dc7c2f
BB
4803 uint32_t flags = hdr->b_flags;
4804
b9541d6b 4805 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
34dc7c2f 4806 /*
428870ff
BB
4807 * Pull the data off of this hdr and attach it to
4808 * a new anonymous hdr.
34dc7c2f
BB
4809 */
4810 (void) remove_reference(hdr, hash_lock, tag);
b9541d6b 4811 bufp = &hdr->b_l1hdr.b_buf;
34dc7c2f
BB
4812 while (*bufp != buf)
4813 bufp = &(*bufp)->b_next;
428870ff 4814 *bufp = buf->b_next;
34dc7c2f
BB
4815 buf->b_next = NULL;
4816
b9541d6b 4817 ASSERT3P(state, !=, arc_l2c_only);
36da08ef
PS
4818
4819 (void) refcount_remove_many(
4820 &state->arcs_size, hdr->b_size, buf);
4821
b9541d6b
CW
4822 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4823 uint64_t *size;
4824
4825 ASSERT3P(state, !=, arc_l2c_only);
4826 size = &state->arcs_lsize[type];
34dc7c2f
BB
4827 ASSERT3U(*size, >=, hdr->b_size);
4828 atomic_add_64(size, -hdr->b_size);
4829 }
1eb5bfa3
GW
4830
4831 /*
4832 * We're releasing a duplicate user data buffer, update
4833 * our statistics accordingly.
4834 */
b9541d6b 4835 if (HDR_ISTYPE_DATA(hdr)) {
1eb5bfa3
GW
4836 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4837 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4838 -hdr->b_size);
4839 }
b9541d6b 4840 hdr->b_l1hdr.b_datacnt -= 1;
34dc7c2f 4841 arc_cksum_verify(buf);
498877ba 4842 arc_buf_unwatch(buf);
34dc7c2f
BB
4843
4844 mutex_exit(hash_lock);
4845
b9541d6b 4846 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
34dc7c2f
BB
4847 nhdr->b_size = blksz;
4848 nhdr->b_spa = spa;
b9541d6b
CW
4849
4850 nhdr->b_l1hdr.b_mru_hits = 0;
4851 nhdr->b_l1hdr.b_mru_ghost_hits = 0;
4852 nhdr->b_l1hdr.b_mfu_hits = 0;
4853 nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
4854 nhdr->b_l1hdr.b_l2_hits = 0;
2a432414 4855 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
b9541d6b
CW
4856 nhdr->b_flags |= arc_bufc_to_flags(type);
4857 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4858
4859 nhdr->b_l1hdr.b_buf = buf;
4860 nhdr->b_l1hdr.b_datacnt = 1;
4861 nhdr->b_l1hdr.b_state = arc_anon;
4862 nhdr->b_l1hdr.b_arc_access = 0;
ca0bf58d 4863 nhdr->b_l1hdr.b_tmp_cdata = NULL;
34dc7c2f 4864 nhdr->b_freeze_cksum = NULL;
b9541d6b
CW
4865
4866 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
34dc7c2f 4867 buf->b_hdr = nhdr;
428870ff 4868 mutex_exit(&buf->b_evict_lock);
36da08ef 4869 (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
34dc7c2f 4870 } else {
428870ff 4871 mutex_exit(&buf->b_evict_lock);
b9541d6b 4872 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
ca0bf58d
PS
4873 /* protected by hash lock, or hdr is on arc_anon */
4874 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
34dc7c2f 4875 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
b9541d6b
CW
4876 hdr->b_l1hdr.b_mru_hits = 0;
4877 hdr->b_l1hdr.b_mru_ghost_hits = 0;
4878 hdr->b_l1hdr.b_mfu_hits = 0;
4879 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
4880 hdr->b_l1hdr.b_l2_hits = 0;
4881 arc_change_state(arc_anon, hdr, hash_lock);
4882 hdr->b_l1hdr.b_arc_access = 0;
4883 mutex_exit(hash_lock);
34dc7c2f 4884
428870ff 4885 buf_discard_identity(hdr);
34dc7c2f
BB
4886 arc_buf_thaw(buf);
4887 }
4888 buf->b_efunc = NULL;
4889 buf->b_private = NULL;
34dc7c2f
BB
4890}
4891
4892int
4893arc_released(arc_buf_t *buf)
4894{
b128c09f
BB
4895 int released;
4896
428870ff 4897 mutex_enter(&buf->b_evict_lock);
b9541d6b
CW
4898 released = (buf->b_data != NULL &&
4899 buf->b_hdr->b_l1hdr.b_state == arc_anon);
428870ff 4900 mutex_exit(&buf->b_evict_lock);
b128c09f 4901 return (released);
34dc7c2f
BB
4902}
4903
34dc7c2f
BB
4904#ifdef ZFS_DEBUG
4905int
4906arc_referenced(arc_buf_t *buf)
4907{
b128c09f
BB
4908 int referenced;
4909
428870ff 4910 mutex_enter(&buf->b_evict_lock);
b9541d6b 4911 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
428870ff 4912 mutex_exit(&buf->b_evict_lock);
b128c09f 4913 return (referenced);
34dc7c2f
BB
4914}
4915#endif
4916
4917static void
4918arc_write_ready(zio_t *zio)
4919{
4920 arc_write_callback_t *callback = zio->io_private;
4921 arc_buf_t *buf = callback->awcb_buf;
4922 arc_buf_hdr_t *hdr = buf->b_hdr;
4923
b9541d6b
CW
4924 ASSERT(HDR_HAS_L1HDR(hdr));
4925 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4926 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
b128c09f
BB
4927 callback->awcb_ready(zio, buf, callback->awcb_private);
4928
34dc7c2f
BB
4929 /*
4930 * If the IO is already in progress, then this is a re-write
b128c09f
BB
4931 * attempt, so we need to thaw and re-compute the cksum.
4932 * It is the responsibility of the callback to handle the
4933 * accounting for any re-write attempt.
34dc7c2f
BB
4934 */
4935 if (HDR_IO_IN_PROGRESS(hdr)) {
b9541d6b 4936 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
4937 if (hdr->b_freeze_cksum != NULL) {
4938 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4939 hdr->b_freeze_cksum = NULL;
4940 }
b9541d6b 4941 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
4942 }
4943 arc_cksum_compute(buf, B_FALSE);
2a432414 4944 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
34dc7c2f
BB
4945}
4946
e8b96c60
MA
4947/*
4948 * The SPA calls this callback for each physical write that happens on behalf
4949 * of a logical write. See the comment in dbuf_write_physdone() for details.
4950 */
4951static void
4952arc_write_physdone(zio_t *zio)
4953{
4954 arc_write_callback_t *cb = zio->io_private;
4955 if (cb->awcb_physdone != NULL)
4956 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4957}
4958
34dc7c2f
BB
4959static void
4960arc_write_done(zio_t *zio)
4961{
4962 arc_write_callback_t *callback = zio->io_private;
4963 arc_buf_t *buf = callback->awcb_buf;
4964 arc_buf_hdr_t *hdr = buf->b_hdr;
4965
b9541d6b 4966 ASSERT(hdr->b_l1hdr.b_acb == NULL);
428870ff
BB
4967
4968 if (zio->io_error == 0) {
9b67f605 4969 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
b0bc7a84
MG
4970 buf_discard_identity(hdr);
4971 } else {
4972 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4973 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
b0bc7a84 4974 }
428870ff
BB
4975 } else {
4976 ASSERT(BUF_EMPTY(hdr));
4977 }
34dc7c2f 4978
34dc7c2f 4979 /*
9b67f605
MA
4980 * If the block to be written was all-zero or compressed enough to be
4981 * embedded in the BP, no write was performed so there will be no
4982 * dva/birth/checksum. The buffer must therefore remain anonymous
4983 * (and uncached).
34dc7c2f
BB
4984 */
4985 if (!BUF_EMPTY(hdr)) {
4986 arc_buf_hdr_t *exists;
4987 kmutex_t *hash_lock;
4988
428870ff
BB
4989 ASSERT(zio->io_error == 0);
4990
34dc7c2f
BB
4991 arc_cksum_verify(buf);
4992
4993 exists = buf_hash_insert(hdr, &hash_lock);
b9541d6b 4994 if (exists != NULL) {
34dc7c2f
BB
4995 /*
4996 * This can only happen if we overwrite for
4997 * sync-to-convergence, because we remove
4998 * buffers from the hash table when we arc_free().
4999 */
428870ff
BB
5000 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
5001 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5002 panic("bad overwrite, hdr=%p exists=%p",
5003 (void *)hdr, (void *)exists);
b9541d6b
CW
5004 ASSERT(refcount_is_zero(
5005 &exists->b_l1hdr.b_refcnt));
428870ff
BB
5006 arc_change_state(arc_anon, exists, hash_lock);
5007 mutex_exit(hash_lock);
5008 arc_hdr_destroy(exists);
5009 exists = buf_hash_insert(hdr, &hash_lock);
5010 ASSERT3P(exists, ==, NULL);
03c6040b
GW
5011 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
5012 /* nopwrite */
5013 ASSERT(zio->io_prop.zp_nopwrite);
5014 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5015 panic("bad nopwrite, hdr=%p exists=%p",
5016 (void *)hdr, (void *)exists);
428870ff
BB
5017 } else {
5018 /* Dedup */
b9541d6b
CW
5019 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
5020 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
428870ff
BB
5021 ASSERT(BP_GET_DEDUP(zio->io_bp));
5022 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
5023 }
34dc7c2f 5024 }
2a432414 5025 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
b128c09f 5026 /* if it's not anon, we are doing a scrub */
b9541d6b 5027 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
b128c09f 5028 arc_access(hdr, hash_lock);
34dc7c2f 5029 mutex_exit(hash_lock);
34dc7c2f 5030 } else {
2a432414 5031 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
34dc7c2f
BB
5032 }
5033
b9541d6b 5034 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
428870ff 5035 callback->awcb_done(zio, buf, callback->awcb_private);
34dc7c2f
BB
5036
5037 kmem_free(callback, sizeof (arc_write_callback_t));
5038}
5039
5040zio_t *
428870ff 5041arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3a17a7a9 5042 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
e8b96c60
MA
5043 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
5044 arc_done_func_t *done, void *private, zio_priority_t priority,
5dbd68a3 5045 int zio_flags, const zbookmark_phys_t *zb)
34dc7c2f
BB
5046{
5047 arc_buf_hdr_t *hdr = buf->b_hdr;
5048 arc_write_callback_t *callback;
b128c09f 5049 zio_t *zio;
34dc7c2f 5050
b128c09f 5051 ASSERT(ready != NULL);
428870ff 5052 ASSERT(done != NULL);
34dc7c2f 5053 ASSERT(!HDR_IO_ERROR(hdr));
b9541d6b
CW
5054 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5055 ASSERT(hdr->b_l1hdr.b_acb == NULL);
5056 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
b128c09f 5057 if (l2arc)
2a432414 5058 hdr->b_flags |= ARC_FLAG_L2CACHE;
3a17a7a9 5059 if (l2arc_compress)
2a432414 5060 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
79c76d5b 5061 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
34dc7c2f 5062 callback->awcb_ready = ready;
e8b96c60 5063 callback->awcb_physdone = physdone;
34dc7c2f
BB
5064 callback->awcb_done = done;
5065 callback->awcb_private = private;
5066 callback->awcb_buf = buf;
b128c09f 5067
428870ff 5068 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
e8b96c60
MA
5069 arc_write_ready, arc_write_physdone, arc_write_done, callback,
5070 priority, zio_flags, zb);
34dc7c2f
BB
5071
5072 return (zio);
5073}
5074
34dc7c2f 5075static int
e8b96c60 5076arc_memory_throttle(uint64_t reserve, uint64_t txg)
34dc7c2f
BB
5077{
5078#ifdef _KERNEL
0c5493d4
BB
5079 if (zfs_arc_memory_throttle_disable)
5080 return (0);
5081
ca67b33a
MA
5082 if (freemem > physmem * arc_lotsfree_percent / 100)
5083 return (0);
5084
5085 if (arc_reclaim_needed()) {
5086 /* memory is low, delay before restarting */
34dc7c2f 5087 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
570827e1 5088 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
2e528b49 5089 return (SET_ERROR(EAGAIN));
34dc7c2f 5090 }
34dc7c2f
BB
5091#endif
5092 return (0);
5093}
5094
5095void
5096arc_tempreserve_clear(uint64_t reserve)
5097{
5098 atomic_add_64(&arc_tempreserve, -reserve);
5099 ASSERT((int64_t)arc_tempreserve >= 0);
5100}
5101
5102int
5103arc_tempreserve_space(uint64_t reserve, uint64_t txg)
5104{
5105 int error;
9babb374 5106 uint64_t anon_size;
34dc7c2f 5107
34dc7c2f
BB
5108 if (reserve > arc_c/4 && !arc_no_grow)
5109 arc_c = MIN(arc_c_max, reserve * 4);
12f9a6a3
BB
5110
5111 /*
5112 * Throttle when the calculated memory footprint for the TXG
5113 * exceeds the target ARC size.
5114 */
570827e1
BB
5115 if (reserve > arc_c) {
5116 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
12f9a6a3 5117 return (SET_ERROR(ERESTART));
570827e1 5118 }
34dc7c2f 5119
9babb374
BB
5120 /*
5121 * Don't count loaned bufs as in flight dirty data to prevent long
5122 * network delays from blocking transactions that are ready to be
5123 * assigned to a txg.
5124 */
36da08ef
PS
5125 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
5126 arc_loaned_bytes), 0);
9babb374 5127
34dc7c2f
BB
5128 /*
5129 * Writes will, almost always, require additional memory allocations
d3cc8b15 5130 * in order to compress/encrypt/etc the data. We therefore need to
34dc7c2f
BB
5131 * make sure that there is sufficient available memory for this.
5132 */
e8b96c60
MA
5133 error = arc_memory_throttle(reserve, txg);
5134 if (error != 0)
34dc7c2f
BB
5135 return (error);
5136
5137 /*
5138 * Throttle writes when the amount of dirty data in the cache
5139 * gets too large. We try to keep the cache less than half full
5140 * of dirty blocks so that our sync times don't grow too large.
5141 * Note: if two requests come in concurrently, we might let them
5142 * both succeed, when one of them should fail. Not a huge deal.
5143 */
9babb374
BB
5144
5145 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
5146 anon_size > arc_c / 4) {
34dc7c2f
BB
5147 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
5148 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
5149 arc_tempreserve>>10,
5150 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
5151 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
5152 reserve>>10, arc_c>>10);
570827e1 5153 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
2e528b49 5154 return (SET_ERROR(ERESTART));
34dc7c2f
BB
5155 }
5156 atomic_add_64(&arc_tempreserve, reserve);
5157 return (0);
5158}
5159
13be560d
BB
5160static void
5161arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
5162 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
5163{
36da08ef 5164 size->value.ui64 = refcount_count(&state->arcs_size);
13be560d
BB
5165 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
5166 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
5167}
5168
5169static int
5170arc_kstat_update(kstat_t *ksp, int rw)
5171{
5172 arc_stats_t *as = ksp->ks_data;
5173
5174 if (rw == KSTAT_WRITE) {
500445c0 5175 return (EACCES);
13be560d
BB
5176 } else {
5177 arc_kstat_update_state(arc_anon,
5178 &as->arcstat_anon_size,
500445c0
PS
5179 &as->arcstat_anon_evictable_data,
5180 &as->arcstat_anon_evictable_metadata);
13be560d
BB
5181 arc_kstat_update_state(arc_mru,
5182 &as->arcstat_mru_size,
500445c0
PS
5183 &as->arcstat_mru_evictable_data,
5184 &as->arcstat_mru_evictable_metadata);
13be560d
BB
5185 arc_kstat_update_state(arc_mru_ghost,
5186 &as->arcstat_mru_ghost_size,
500445c0
PS
5187 &as->arcstat_mru_ghost_evictable_data,
5188 &as->arcstat_mru_ghost_evictable_metadata);
13be560d
BB
5189 arc_kstat_update_state(arc_mfu,
5190 &as->arcstat_mfu_size,
500445c0
PS
5191 &as->arcstat_mfu_evictable_data,
5192 &as->arcstat_mfu_evictable_metadata);
fc41c640 5193 arc_kstat_update_state(arc_mfu_ghost,
13be560d 5194 &as->arcstat_mfu_ghost_size,
500445c0
PS
5195 &as->arcstat_mfu_ghost_evictable_data,
5196 &as->arcstat_mfu_ghost_evictable_metadata);
13be560d
BB
5197 }
5198
5199 return (0);
5200}
5201
ca0bf58d
PS
5202/*
5203 * This function *must* return indices evenly distributed between all
5204 * sublists of the multilist. This is needed due to how the ARC eviction
5205 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
5206 * distributed between all sublists and uses this assumption when
5207 * deciding which sublist to evict from and how much to evict from it.
5208 */
5209unsigned int
5210arc_state_multilist_index_func(multilist_t *ml, void *obj)
5211{
5212 arc_buf_hdr_t *hdr = obj;
5213
5214 /*
5215 * We rely on b_dva to generate evenly distributed index
5216 * numbers using buf_hash below. So, as an added precaution,
5217 * let's make sure we never add empty buffers to the arc lists.
5218 */
5219 ASSERT(!BUF_EMPTY(hdr));
5220
5221 /*
5222 * The assumption here, is the hash value for a given
5223 * arc_buf_hdr_t will remain constant throughout its lifetime
5224 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
5225 * Thus, we don't need to store the header's sublist index
5226 * on insertion, as this index can be recalculated on removal.
5227 *
5228 * Also, the low order bits of the hash value are thought to be
5229 * distributed evenly. Otherwise, in the case that the multilist
5230 * has a power of two number of sublists, each sublists' usage
5231 * would not be evenly distributed.
5232 */
5233 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5234 multilist_get_num_sublists(ml));
5235}
5236
ca67b33a
MA
5237/*
5238 * Called during module initialization and periodically thereafter to
5239 * apply reasonable changes to the exposed performance tunings. Non-zero
5240 * zfs_* values which differ from the currently set values will be applied.
5241 */
5242static void
5243arc_tuning_update(void)
5244{
5245 /* Valid range: 64M - <all physical memory> */
5246 if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
5247 (zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) &&
5248 (zfs_arc_max > arc_c_min)) {
5249 arc_c_max = zfs_arc_max;
5250 arc_c = arc_c_max;
5251 arc_p = (arc_c >> 1);
5252 arc_meta_limit = MIN(arc_meta_limit, arc_c_max);
5253 }
5254
5255 /* Valid range: 32M - <arc_c_max> */
5256 if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
5257 (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
5258 (zfs_arc_min <= arc_c_max)) {
5259 arc_c_min = zfs_arc_min;
5260 arc_c = MAX(arc_c, arc_c_min);
5261 }
5262
5263 /* Valid range: 16M - <arc_c_max> */
5264 if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
5265 (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
5266 (zfs_arc_meta_min <= arc_c_max)) {
5267 arc_meta_min = zfs_arc_meta_min;
5268 arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
5269 }
5270
5271 /* Valid range: <arc_meta_min> - <arc_c_max> */
5272 if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) &&
5273 (zfs_arc_meta_limit >= zfs_arc_meta_min) &&
5274 (zfs_arc_meta_limit <= arc_c_max))
5275 arc_meta_limit = zfs_arc_meta_limit;
5276
5277 /* Valid range: 1 - N */
5278 if (zfs_arc_grow_retry)
5279 arc_grow_retry = zfs_arc_grow_retry;
5280
5281 /* Valid range: 1 - N */
5282 if (zfs_arc_shrink_shift) {
5283 arc_shrink_shift = zfs_arc_shrink_shift;
5284 arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
5285 }
5286
728d6ae9
BB
5287 /* Valid range: 1 - N */
5288 if (zfs_arc_p_min_shift)
5289 arc_p_min_shift = zfs_arc_p_min_shift;
5290
ca67b33a
MA
5291 /* Valid range: 1 - N ticks */
5292 if (zfs_arc_min_prefetch_lifespan)
5293 arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
5294}
5295
34dc7c2f
BB
5296void
5297arc_init(void)
5298{
ca67b33a
MA
5299 /*
5300 * allmem is "all memory that we could possibly use".
5301 */
5302#ifdef _KERNEL
5303 uint64_t allmem = ptob(physmem);
5304#else
5305 uint64_t allmem = (physmem * PAGESIZE) / 2;
5306#endif
5307
ca0bf58d
PS
5308 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
5309 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
5310 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
5311
5312 mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5313 cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
34dc7c2f
BB
5314
5315 /* Convert seconds to clock ticks */
ca67b33a 5316 arc_min_prefetch_lifespan = 1 * hz;
34dc7c2f
BB
5317
5318 /* Start out with 1/8 of all memory */
ca67b33a 5319 arc_c = allmem / 8;
34dc7c2f
BB
5320
5321#ifdef _KERNEL
5322 /*
5323 * On architectures where the physical memory can be larger
5324 * than the addressable space (intel in 32-bit mode), we may
5325 * need to limit the cache to 1/8 of VM size.
5326 */
5327 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
ca67b33a 5328
7cb67b45
BB
5329 /*
5330 * Register a shrinker to support synchronous (direct) memory
5331 * reclaim from the arc. This is done to prevent kswapd from
5332 * swapping out pages when it is preferable to shrink the arc.
5333 */
5334 spl_register_shrinker(&arc_shrinker);
34dc7c2f
BB
5335#endif
5336
ca67b33a 5337 /* Set min cache to allow safe operation of arc_adapt() */
121b3cae 5338 arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT;
ca67b33a
MA
5339 /* Set max to 1/2 of all memory */
5340 arc_c_max = allmem / 2;
34dc7c2f
BB
5341
5342 arc_c = arc_c_max;
5343 arc_p = (arc_c >> 1);
5344
ca67b33a
MA
5345 /* Set min to 1/2 of arc_c_min */
5346 arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
5347 /* Initialize maximum observed usage to zero */
1834f2d8 5348 arc_meta_max = 0;
ca67b33a
MA
5349 /* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */
5350 arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min);
34dc7c2f 5351
ca67b33a
MA
5352 /* Apply user specified tunings */
5353 arc_tuning_update();
c52fca13 5354
ca0bf58d 5355 if (zfs_arc_num_sublists_per_state < 1)
ca67b33a 5356 zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
ca0bf58d 5357
34dc7c2f
BB
5358 /* if kmem_flags are set, lets try to use less memory */
5359 if (kmem_debugging())
5360 arc_c = arc_c / 2;
5361 if (arc_c < arc_c_min)
5362 arc_c = arc_c_min;
5363
5364 arc_anon = &ARC_anon;
5365 arc_mru = &ARC_mru;
5366 arc_mru_ghost = &ARC_mru_ghost;
5367 arc_mfu = &ARC_mfu;
5368 arc_mfu_ghost = &ARC_mfu_ghost;
5369 arc_l2c_only = &ARC_l2c_only;
5370 arc_size = 0;
5371
ca0bf58d 5372 multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5373 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5374 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5375 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5376 multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
b9541d6b 5377 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5378 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5379 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5380 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5381 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5382 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5383 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5384 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
b9541d6b 5385 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5386 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5387 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5388 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5389 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5390 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5391 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5392 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
b9541d6b 5393 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5394 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5395 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5396 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5397 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5398 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5399 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5400 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
b9541d6b 5401 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5402 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5403 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5404 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5405 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5406 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5407 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5408 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
b9541d6b 5409 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5410 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5411 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
34dc7c2f 5412
e0b0ca98
BB
5413 arc_anon->arcs_state = ARC_STATE_ANON;
5414 arc_mru->arcs_state = ARC_STATE_MRU;
5415 arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
5416 arc_mfu->arcs_state = ARC_STATE_MFU;
5417 arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
5418 arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
5419
36da08ef
PS
5420 refcount_create(&arc_anon->arcs_size);
5421 refcount_create(&arc_mru->arcs_size);
5422 refcount_create(&arc_mru_ghost->arcs_size);
5423 refcount_create(&arc_mfu->arcs_size);
5424 refcount_create(&arc_mfu_ghost->arcs_size);
5425 refcount_create(&arc_l2c_only->arcs_size);
5426
34dc7c2f
BB
5427 buf_init();
5428
ca0bf58d
PS
5429 arc_reclaim_thread_exit = FALSE;
5430 arc_user_evicts_thread_exit = FALSE;
ab26409d
BB
5431 list_create(&arc_prune_list, sizeof (arc_prune_t),
5432 offsetof(arc_prune_t, p_node));
34dc7c2f 5433 arc_eviction_list = NULL;
ab26409d 5434 mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
5435 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
5436
f6046738 5437 arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
aa9af22c 5438 max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
f6046738 5439
34dc7c2f
BB
5440 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
5441 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
5442
5443 if (arc_ksp != NULL) {
5444 arc_ksp->ks_data = &arc_stats;
13be560d 5445 arc_ksp->ks_update = arc_kstat_update;
34dc7c2f
BB
5446 kstat_install(arc_ksp);
5447 }
5448
ca67b33a 5449 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
34dc7c2f
BB
5450 TS_RUN, minclsyspri);
5451
ca0bf58d
PS
5452 (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
5453 TS_RUN, minclsyspri);
5454
34dc7c2f 5455 arc_dead = FALSE;
b128c09f 5456 arc_warm = B_FALSE;
34dc7c2f 5457
e8b96c60
MA
5458 /*
5459 * Calculate maximum amount of dirty data per pool.
5460 *
5461 * If it has been set by a module parameter, take that.
5462 * Otherwise, use a percentage of physical memory defined by
5463 * zfs_dirty_data_max_percent (default 10%) with a cap at
5464 * zfs_dirty_data_max_max (default 25% of physical memory).
5465 */
5466 if (zfs_dirty_data_max_max == 0)
5467 zfs_dirty_data_max_max = physmem * PAGESIZE *
5468 zfs_dirty_data_max_max_percent / 100;
5469
5470 if (zfs_dirty_data_max == 0) {
5471 zfs_dirty_data_max = physmem * PAGESIZE *
5472 zfs_dirty_data_max_percent / 100;
5473 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
5474 zfs_dirty_data_max_max);
5475 }
34dc7c2f
BB
5476}
5477
5478void
5479arc_fini(void)
5480{
ab26409d
BB
5481 arc_prune_t *p;
5482
7cb67b45
BB
5483#ifdef _KERNEL
5484 spl_unregister_shrinker(&arc_shrinker);
5485#endif /* _KERNEL */
5486
ca0bf58d
PS
5487 mutex_enter(&arc_reclaim_lock);
5488 arc_reclaim_thread_exit = TRUE;
5489 /*
5490 * The reclaim thread will set arc_reclaim_thread_exit back to
5491 * FALSE when it is finished exiting; we're waiting for that.
5492 */
5493 while (arc_reclaim_thread_exit) {
5494 cv_signal(&arc_reclaim_thread_cv);
5495 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
5496 }
5497 mutex_exit(&arc_reclaim_lock);
5498
5499 mutex_enter(&arc_user_evicts_lock);
5500 arc_user_evicts_thread_exit = TRUE;
5501 /*
5502 * The user evicts thread will set arc_user_evicts_thread_exit
5503 * to FALSE when it is finished exiting; we're waiting for that.
5504 */
5505 while (arc_user_evicts_thread_exit) {
5506 cv_signal(&arc_user_evicts_cv);
5507 cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5508 }
5509 mutex_exit(&arc_user_evicts_lock);
34dc7c2f 5510
ca0bf58d
PS
5511 /* Use TRUE to ensure *all* buffers are evicted */
5512 arc_flush(NULL, TRUE);
34dc7c2f
BB
5513
5514 arc_dead = TRUE;
5515
5516 if (arc_ksp != NULL) {
5517 kstat_delete(arc_ksp);
5518 arc_ksp = NULL;
5519 }
5520
f6046738
BB
5521 taskq_wait(arc_prune_taskq);
5522 taskq_destroy(arc_prune_taskq);
5523
ab26409d
BB
5524 mutex_enter(&arc_prune_mtx);
5525 while ((p = list_head(&arc_prune_list)) != NULL) {
5526 list_remove(&arc_prune_list, p);
5527 refcount_remove(&p->p_refcnt, &arc_prune_list);
5528 refcount_destroy(&p->p_refcnt);
5529 kmem_free(p, sizeof (*p));
5530 }
5531 mutex_exit(&arc_prune_mtx);
5532
5533 list_destroy(&arc_prune_list);
5534 mutex_destroy(&arc_prune_mtx);
ca0bf58d
PS
5535 mutex_destroy(&arc_reclaim_lock);
5536 cv_destroy(&arc_reclaim_thread_cv);
5537 cv_destroy(&arc_reclaim_waiters_cv);
5538
5539 mutex_destroy(&arc_user_evicts_lock);
5540 cv_destroy(&arc_user_evicts_cv);
5541
36da08ef
PS
5542 refcount_destroy(&arc_anon->arcs_size);
5543 refcount_destroy(&arc_mru->arcs_size);
5544 refcount_destroy(&arc_mru_ghost->arcs_size);
5545 refcount_destroy(&arc_mfu->arcs_size);
5546 refcount_destroy(&arc_mfu_ghost->arcs_size);
5547 refcount_destroy(&arc_l2c_only->arcs_size);
5548
ca0bf58d
PS
5549 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
5550 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
5551 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
5552 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
5553 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
5554 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
5555 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
5556 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
5557 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
5558 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
34dc7c2f
BB
5559
5560 buf_fini();
9babb374 5561
b9541d6b 5562 ASSERT0(arc_loaned_bytes);
34dc7c2f
BB
5563}
5564
5565/*
5566 * Level 2 ARC
5567 *
5568 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5569 * It uses dedicated storage devices to hold cached data, which are populated
5570 * using large infrequent writes. The main role of this cache is to boost
5571 * the performance of random read workloads. The intended L2ARC devices
5572 * include short-stroked disks, solid state disks, and other media with
5573 * substantially faster read latency than disk.
5574 *
5575 * +-----------------------+
5576 * | ARC |
5577 * +-----------------------+
5578 * | ^ ^
5579 * | | |
5580 * l2arc_feed_thread() arc_read()
5581 * | | |
5582 * | l2arc read |
5583 * V | |
5584 * +---------------+ |
5585 * | L2ARC | |
5586 * +---------------+ |
5587 * | ^ |
5588 * l2arc_write() | |
5589 * | | |
5590 * V | |
5591 * +-------+ +-------+
5592 * | vdev | | vdev |
5593 * | cache | | cache |
5594 * +-------+ +-------+
5595 * +=========+ .-----.
5596 * : L2ARC : |-_____-|
5597 * : devices : | Disks |
5598 * +=========+ `-_____-'
5599 *
5600 * Read requests are satisfied from the following sources, in order:
5601 *
5602 * 1) ARC
5603 * 2) vdev cache of L2ARC devices
5604 * 3) L2ARC devices
5605 * 4) vdev cache of disks
5606 * 5) disks
5607 *
5608 * Some L2ARC device types exhibit extremely slow write performance.
5609 * To accommodate for this there are some significant differences between
5610 * the L2ARC and traditional cache design:
5611 *
5612 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
5613 * the ARC behave as usual, freeing buffers and placing headers on ghost
5614 * lists. The ARC does not send buffers to the L2ARC during eviction as
5615 * this would add inflated write latencies for all ARC memory pressure.
5616 *
5617 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5618 * It does this by periodically scanning buffers from the eviction-end of
5619 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3a17a7a9
SK
5620 * not already there. It scans until a headroom of buffers is satisfied,
5621 * which itself is a buffer for ARC eviction. If a compressible buffer is
5622 * found during scanning and selected for writing to an L2ARC device, we
5623 * temporarily boost scanning headroom during the next scan cycle to make
5624 * sure we adapt to compression effects (which might significantly reduce
5625 * the data volume we write to L2ARC). The thread that does this is
34dc7c2f
BB
5626 * l2arc_feed_thread(), illustrated below; example sizes are included to
5627 * provide a better sense of ratio than this diagram:
5628 *
5629 * head --> tail
5630 * +---------------------+----------+
5631 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
5632 * +---------------------+----------+ | o L2ARC eligible
5633 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
5634 * +---------------------+----------+ |
5635 * 15.9 Gbytes ^ 32 Mbytes |
5636 * headroom |
5637 * l2arc_feed_thread()
5638 * |
5639 * l2arc write hand <--[oooo]--'
5640 * | 8 Mbyte
5641 * | write max
5642 * V
5643 * +==============================+
5644 * L2ARC dev |####|#|###|###| |####| ... |
5645 * +==============================+
5646 * 32 Gbytes
5647 *
5648 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5649 * evicted, then the L2ARC has cached a buffer much sooner than it probably
5650 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
5651 * safe to say that this is an uncommon case, since buffers at the end of
5652 * the ARC lists have moved there due to inactivity.
5653 *
5654 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5655 * then the L2ARC simply misses copying some buffers. This serves as a
5656 * pressure valve to prevent heavy read workloads from both stalling the ARC
5657 * with waits and clogging the L2ARC with writes. This also helps prevent
5658 * the potential for the L2ARC to churn if it attempts to cache content too
5659 * quickly, such as during backups of the entire pool.
5660 *
b128c09f
BB
5661 * 5. After system boot and before the ARC has filled main memory, there are
5662 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5663 * lists can remain mostly static. Instead of searching from tail of these
5664 * lists as pictured, the l2arc_feed_thread() will search from the list heads
5665 * for eligible buffers, greatly increasing its chance of finding them.
5666 *
5667 * The L2ARC device write speed is also boosted during this time so that
5668 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
5669 * there are no L2ARC reads, and no fear of degrading read performance
5670 * through increased writes.
5671 *
5672 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
34dc7c2f
BB
5673 * the vdev queue can aggregate them into larger and fewer writes. Each
5674 * device is written to in a rotor fashion, sweeping writes through
5675 * available space then repeating.
5676 *
b128c09f 5677 * 7. The L2ARC does not store dirty content. It never needs to flush
34dc7c2f
BB
5678 * write buffers back to disk based storage.
5679 *
b128c09f 5680 * 8. If an ARC buffer is written (and dirtied) which also exists in the
34dc7c2f
BB
5681 * L2ARC, the now stale L2ARC buffer is immediately dropped.
5682 *
5683 * The performance of the L2ARC can be tweaked by a number of tunables, which
5684 * may be necessary for different workloads:
5685 *
5686 * l2arc_write_max max write bytes per interval
b128c09f 5687 * l2arc_write_boost extra write bytes during device warmup
34dc7c2f 5688 * l2arc_noprefetch skip caching prefetched buffers
3a17a7a9 5689 * l2arc_nocompress skip compressing buffers
34dc7c2f 5690 * l2arc_headroom number of max device writes to precache
3a17a7a9
SK
5691 * l2arc_headroom_boost when we find compressed buffers during ARC
5692 * scanning, we multiply headroom by this
5693 * percentage factor for the next scan cycle,
5694 * since more compressed buffers are likely to
5695 * be present
34dc7c2f
BB
5696 * l2arc_feed_secs seconds between L2ARC writing
5697 *
5698 * Tunables may be removed or added as future performance improvements are
5699 * integrated, and also may become zpool properties.
d164b209
BB
5700 *
5701 * There are three key functions that control how the L2ARC warms up:
5702 *
5703 * l2arc_write_eligible() check if a buffer is eligible to cache
5704 * l2arc_write_size() calculate how much to write
5705 * l2arc_write_interval() calculate sleep delay between writes
5706 *
5707 * These three functions determine what to write, how much, and how quickly
5708 * to send writes.
34dc7c2f
BB
5709 */
5710
d164b209 5711static boolean_t
2a432414 5712l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
d164b209
BB
5713{
5714 /*
5715 * A buffer is *not* eligible for the L2ARC if it:
5716 * 1. belongs to a different spa.
428870ff
BB
5717 * 2. is already cached on the L2ARC.
5718 * 3. has an I/O in progress (it may be an incomplete read).
5719 * 4. is flagged not eligible (zfs property).
d164b209 5720 */
b9541d6b 5721 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
2a432414 5722 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
d164b209
BB
5723 return (B_FALSE);
5724
5725 return (B_TRUE);
5726}
5727
5728static uint64_t
3a17a7a9 5729l2arc_write_size(void)
d164b209
BB
5730{
5731 uint64_t size;
5732
3a17a7a9
SK
5733 /*
5734 * Make sure our globals have meaningful values in case the user
5735 * altered them.
5736 */
5737 size = l2arc_write_max;
5738 if (size == 0) {
5739 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5740 "be greater than zero, resetting it to the default (%d)",
5741 L2ARC_WRITE_SIZE);
5742 size = l2arc_write_max = L2ARC_WRITE_SIZE;
5743 }
d164b209
BB
5744
5745 if (arc_warm == B_FALSE)
3a17a7a9 5746 size += l2arc_write_boost;
d164b209
BB
5747
5748 return (size);
5749
5750}
5751
5752static clock_t
5753l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5754{
428870ff 5755 clock_t interval, next, now;
d164b209
BB
5756
5757 /*
5758 * If the ARC lists are busy, increase our write rate; if the
5759 * lists are stale, idle back. This is achieved by checking
5760 * how much we previously wrote - if it was more than half of
5761 * what we wanted, schedule the next write much sooner.
5762 */
5763 if (l2arc_feed_again && wrote > (wanted / 2))
5764 interval = (hz * l2arc_feed_min_ms) / 1000;
5765 else
5766 interval = hz * l2arc_feed_secs;
5767
428870ff
BB
5768 now = ddi_get_lbolt();
5769 next = MAX(now, MIN(now + interval, began + interval));
d164b209
BB
5770
5771 return (next);
5772}
5773
34dc7c2f
BB
5774/*
5775 * Cycle through L2ARC devices. This is how L2ARC load balances.
b128c09f 5776 * If a device is returned, this also returns holding the spa config lock.
34dc7c2f
BB
5777 */
5778static l2arc_dev_t *
5779l2arc_dev_get_next(void)
5780{
b128c09f 5781 l2arc_dev_t *first, *next = NULL;
34dc7c2f 5782
b128c09f
BB
5783 /*
5784 * Lock out the removal of spas (spa_namespace_lock), then removal
5785 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
5786 * both locks will be dropped and a spa config lock held instead.
5787 */
5788 mutex_enter(&spa_namespace_lock);
5789 mutex_enter(&l2arc_dev_mtx);
5790
5791 /* if there are no vdevs, there is nothing to do */
5792 if (l2arc_ndev == 0)
5793 goto out;
5794
5795 first = NULL;
5796 next = l2arc_dev_last;
5797 do {
5798 /* loop around the list looking for a non-faulted vdev */
5799 if (next == NULL) {
34dc7c2f 5800 next = list_head(l2arc_dev_list);
b128c09f
BB
5801 } else {
5802 next = list_next(l2arc_dev_list, next);
5803 if (next == NULL)
5804 next = list_head(l2arc_dev_list);
5805 }
5806
5807 /* if we have come back to the start, bail out */
5808 if (first == NULL)
5809 first = next;
5810 else if (next == first)
5811 break;
5812
5813 } while (vdev_is_dead(next->l2ad_vdev));
5814
5815 /* if we were unable to find any usable vdevs, return NULL */
5816 if (vdev_is_dead(next->l2ad_vdev))
5817 next = NULL;
34dc7c2f
BB
5818
5819 l2arc_dev_last = next;
5820
b128c09f
BB
5821out:
5822 mutex_exit(&l2arc_dev_mtx);
5823
5824 /*
5825 * Grab the config lock to prevent the 'next' device from being
5826 * removed while we are writing to it.
5827 */
5828 if (next != NULL)
5829 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5830 mutex_exit(&spa_namespace_lock);
5831
34dc7c2f
BB
5832 return (next);
5833}
5834
b128c09f
BB
5835/*
5836 * Free buffers that were tagged for destruction.
5837 */
5838static void
0bc8fd78 5839l2arc_do_free_on_write(void)
b128c09f
BB
5840{
5841 list_t *buflist;
5842 l2arc_data_free_t *df, *df_prev;
5843
5844 mutex_enter(&l2arc_free_on_write_mtx);
5845 buflist = l2arc_free_on_write;
5846
5847 for (df = list_tail(buflist); df; df = df_prev) {
5848 df_prev = list_prev(buflist, df);
5849 ASSERT(df->l2df_data != NULL);
5850 ASSERT(df->l2df_func != NULL);
5851 df->l2df_func(df->l2df_data, df->l2df_size);
5852 list_remove(buflist, df);
5853 kmem_free(df, sizeof (l2arc_data_free_t));
5854 }
5855
5856 mutex_exit(&l2arc_free_on_write_mtx);
5857}
5858
34dc7c2f
BB
5859/*
5860 * A write to a cache device has completed. Update all headers to allow
5861 * reads from these buffers to begin.
5862 */
5863static void
5864l2arc_write_done(zio_t *zio)
5865{
5866 l2arc_write_callback_t *cb;
5867 l2arc_dev_t *dev;
5868 list_t *buflist;
2a432414 5869 arc_buf_hdr_t *head, *hdr, *hdr_prev;
34dc7c2f 5870 kmutex_t *hash_lock;
3bec585e 5871 int64_t bytes_dropped = 0;
34dc7c2f
BB
5872
5873 cb = zio->io_private;
5874 ASSERT(cb != NULL);
5875 dev = cb->l2wcb_dev;
5876 ASSERT(dev != NULL);
5877 head = cb->l2wcb_head;
5878 ASSERT(head != NULL);
b9541d6b 5879 buflist = &dev->l2ad_buflist;
34dc7c2f
BB
5880 ASSERT(buflist != NULL);
5881 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5882 l2arc_write_callback_t *, cb);
5883
5884 if (zio->io_error != 0)
5885 ARCSTAT_BUMP(arcstat_l2_writes_error);
5886
34dc7c2f
BB
5887 /*
5888 * All writes completed, or an error was hit.
5889 */
ca0bf58d
PS
5890top:
5891 mutex_enter(&dev->l2ad_mtx);
2a432414
GW
5892 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5893 hdr_prev = list_prev(buflist, hdr);
34dc7c2f 5894
2a432414 5895 hash_lock = HDR_LOCK(hdr);
ca0bf58d
PS
5896
5897 /*
5898 * We cannot use mutex_enter or else we can deadlock
5899 * with l2arc_write_buffers (due to swapping the order
5900 * the hash lock and l2ad_mtx are taken).
5901 */
34dc7c2f
BB
5902 if (!mutex_tryenter(hash_lock)) {
5903 /*
ca0bf58d
PS
5904 * Missed the hash lock. We must retry so we
5905 * don't leave the ARC_FLAG_L2_WRITING bit set.
34dc7c2f 5906 */
ca0bf58d
PS
5907 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
5908
5909 /*
5910 * We don't want to rescan the headers we've
5911 * already marked as having been written out, so
5912 * we reinsert the head node so we can pick up
5913 * where we left off.
5914 */
5915 list_remove(buflist, head);
5916 list_insert_after(buflist, hdr, head);
5917
5918 mutex_exit(&dev->l2ad_mtx);
5919
5920 /*
5921 * We wait for the hash lock to become available
5922 * to try and prevent busy waiting, and increase
5923 * the chance we'll be able to acquire the lock
5924 * the next time around.
5925 */
5926 mutex_enter(hash_lock);
5927 mutex_exit(hash_lock);
5928 goto top;
34dc7c2f
BB
5929 }
5930
b9541d6b 5931 /*
ca0bf58d
PS
5932 * We could not have been moved into the arc_l2c_only
5933 * state while in-flight due to our ARC_FLAG_L2_WRITING
5934 * bit being set. Let's just ensure that's being enforced.
5935 */
5936 ASSERT(HDR_HAS_L1HDR(hdr));
5937
5938 /*
5939 * We may have allocated a buffer for L2ARC compression,
5940 * we must release it to avoid leaking this data.
b9541d6b 5941 */
ca0bf58d 5942 l2arc_release_cdata_buf(hdr);
b9541d6b 5943
34dc7c2f
BB
5944 if (zio->io_error != 0) {
5945 /*
b128c09f 5946 * Error - drop L2ARC entry.
34dc7c2f 5947 */
2a432414 5948 list_remove(buflist, hdr);
b9541d6b
CW
5949 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5950
5951 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
2a432414 5952 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
d962d5da
PS
5953
5954 bytes_dropped += hdr->b_l2hdr.b_asize;
5955 (void) refcount_remove_many(&dev->l2ad_alloc,
5956 hdr->b_l2hdr.b_asize, hdr);
34dc7c2f
BB
5957 }
5958
5959 /*
ca0bf58d
PS
5960 * Allow ARC to begin reads and ghost list evictions to
5961 * this L2ARC entry.
34dc7c2f 5962 */
2a432414 5963 hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
34dc7c2f
BB
5964
5965 mutex_exit(hash_lock);
5966 }
5967
5968 atomic_inc_64(&l2arc_writes_done);
5969 list_remove(buflist, head);
b9541d6b
CW
5970 ASSERT(!HDR_HAS_L1HDR(head));
5971 kmem_cache_free(hdr_l2only_cache, head);
5972 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 5973
3bec585e
SK
5974 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5975
b128c09f 5976 l2arc_do_free_on_write();
34dc7c2f
BB
5977
5978 kmem_free(cb, sizeof (l2arc_write_callback_t));
5979}
5980
5981/*
5982 * A read to a cache device completed. Validate buffer contents before
5983 * handing over to the regular ARC routines.
5984 */
5985static void
5986l2arc_read_done(zio_t *zio)
5987{
5988 l2arc_read_callback_t *cb;
5989 arc_buf_hdr_t *hdr;
5990 arc_buf_t *buf;
34dc7c2f 5991 kmutex_t *hash_lock;
b128c09f
BB
5992 int equal;
5993
5994 ASSERT(zio->io_vd != NULL);
5995 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5996
5997 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
34dc7c2f
BB
5998
5999 cb = zio->io_private;
6000 ASSERT(cb != NULL);
6001 buf = cb->l2rcb_buf;
6002 ASSERT(buf != NULL);
34dc7c2f 6003
428870ff 6004 hash_lock = HDR_LOCK(buf->b_hdr);
34dc7c2f 6005 mutex_enter(hash_lock);
428870ff
BB
6006 hdr = buf->b_hdr;
6007 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
34dc7c2f 6008
3a17a7a9
SK
6009 /*
6010 * If the buffer was compressed, decompress it first.
6011 */
6012 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
6013 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
6014 ASSERT(zio->io_data != NULL);
6015
34dc7c2f
BB
6016 /*
6017 * Check this survived the L2ARC journey.
6018 */
6019 equal = arc_cksum_equal(buf);
6020 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
6021 mutex_exit(hash_lock);
6022 zio->io_private = buf;
b128c09f
BB
6023 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
6024 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
34dc7c2f
BB
6025 arc_read_done(zio);
6026 } else {
6027 mutex_exit(hash_lock);
6028 /*
6029 * Buffer didn't survive caching. Increment stats and
6030 * reissue to the original storage device.
6031 */
b128c09f 6032 if (zio->io_error != 0) {
34dc7c2f 6033 ARCSTAT_BUMP(arcstat_l2_io_error);
b128c09f 6034 } else {
2e528b49 6035 zio->io_error = SET_ERROR(EIO);
b128c09f 6036 }
34dc7c2f
BB
6037 if (!equal)
6038 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
6039
34dc7c2f 6040 /*
b128c09f
BB
6041 * If there's no waiter, issue an async i/o to the primary
6042 * storage now. If there *is* a waiter, the caller must
6043 * issue the i/o in a context where it's OK to block.
34dc7c2f 6044 */
d164b209
BB
6045 if (zio->io_waiter == NULL) {
6046 zio_t *pio = zio_unique_parent(zio);
6047
6048 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
6049
6050 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
b128c09f
BB
6051 buf->b_data, zio->io_size, arc_read_done, buf,
6052 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
d164b209 6053 }
34dc7c2f
BB
6054 }
6055
6056 kmem_free(cb, sizeof (l2arc_read_callback_t));
6057}
6058
6059/*
6060 * This is the list priority from which the L2ARC will search for pages to
6061 * cache. This is used within loops (0..3) to cycle through lists in the
6062 * desired order. This order can have a significant effect on cache
6063 * performance.
6064 *
6065 * Currently the metadata lists are hit first, MFU then MRU, followed by
6066 * the data lists. This function returns a locked list, and also returns
6067 * the lock pointer.
6068 */
ca0bf58d
PS
6069static multilist_sublist_t *
6070l2arc_sublist_lock(int list_num)
34dc7c2f 6071{
ca0bf58d
PS
6072 multilist_t *ml = NULL;
6073 unsigned int idx;
34dc7c2f
BB
6074
6075 ASSERT(list_num >= 0 && list_num <= 3);
6076
6077 switch (list_num) {
6078 case 0:
ca0bf58d 6079 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
34dc7c2f
BB
6080 break;
6081 case 1:
ca0bf58d 6082 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
34dc7c2f
BB
6083 break;
6084 case 2:
ca0bf58d 6085 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
34dc7c2f
BB
6086 break;
6087 case 3:
ca0bf58d 6088 ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
34dc7c2f
BB
6089 break;
6090 }
6091
ca0bf58d
PS
6092 /*
6093 * Return a randomly-selected sublist. This is acceptable
6094 * because the caller feeds only a little bit of data for each
6095 * call (8MB). Subsequent calls will result in different
6096 * sublists being selected.
6097 */
6098 idx = multilist_get_random_index(ml);
6099 return (multilist_sublist_lock(ml, idx));
34dc7c2f
BB
6100}
6101
6102/*
6103 * Evict buffers from the device write hand to the distance specified in
6104 * bytes. This distance may span populated buffers, it may span nothing.
6105 * This is clearing a region on the L2ARC device ready for writing.
6106 * If the 'all' boolean is set, every buffer is evicted.
6107 */
6108static void
6109l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
6110{
6111 list_t *buflist;
2a432414 6112 arc_buf_hdr_t *hdr, *hdr_prev;
34dc7c2f
BB
6113 kmutex_t *hash_lock;
6114 uint64_t taddr;
6115
b9541d6b 6116 buflist = &dev->l2ad_buflist;
34dc7c2f
BB
6117
6118 if (!all && dev->l2ad_first) {
6119 /*
6120 * This is the first sweep through the device. There is
6121 * nothing to evict.
6122 */
6123 return;
6124 }
6125
b128c09f 6126 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
34dc7c2f
BB
6127 /*
6128 * When nearing the end of the device, evict to the end
6129 * before the device write hand jumps to the start.
6130 */
6131 taddr = dev->l2ad_end;
6132 } else {
6133 taddr = dev->l2ad_hand + distance;
6134 }
6135 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
6136 uint64_t, taddr, boolean_t, all);
6137
6138top:
b9541d6b 6139 mutex_enter(&dev->l2ad_mtx);
2a432414
GW
6140 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
6141 hdr_prev = list_prev(buflist, hdr);
34dc7c2f 6142
2a432414 6143 hash_lock = HDR_LOCK(hdr);
ca0bf58d
PS
6144
6145 /*
6146 * We cannot use mutex_enter or else we can deadlock
6147 * with l2arc_write_buffers (due to swapping the order
6148 * the hash lock and l2ad_mtx are taken).
6149 */
34dc7c2f
BB
6150 if (!mutex_tryenter(hash_lock)) {
6151 /*
6152 * Missed the hash lock. Retry.
6153 */
6154 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
b9541d6b 6155 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
6156 mutex_enter(hash_lock);
6157 mutex_exit(hash_lock);
6158 goto top;
6159 }
6160
2a432414 6161 if (HDR_L2_WRITE_HEAD(hdr)) {
34dc7c2f
BB
6162 /*
6163 * We hit a write head node. Leave it for
6164 * l2arc_write_done().
6165 */
2a432414 6166 list_remove(buflist, hdr);
34dc7c2f
BB
6167 mutex_exit(hash_lock);
6168 continue;
6169 }
6170
b9541d6b
CW
6171 if (!all && HDR_HAS_L2HDR(hdr) &&
6172 (hdr->b_l2hdr.b_daddr > taddr ||
6173 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
34dc7c2f
BB
6174 /*
6175 * We've evicted to the target address,
6176 * or the end of the device.
6177 */
6178 mutex_exit(hash_lock);
6179 break;
6180 }
6181
b9541d6b
CW
6182 ASSERT(HDR_HAS_L2HDR(hdr));
6183 if (!HDR_HAS_L1HDR(hdr)) {
2a432414 6184 ASSERT(!HDR_L2_READING(hdr));
34dc7c2f
BB
6185 /*
6186 * This doesn't exist in the ARC. Destroy.
6187 * arc_hdr_destroy() will call list_remove()
6188 * and decrement arcstat_l2_size.
6189 */
2a432414
GW
6190 arc_change_state(arc_anon, hdr, hash_lock);
6191 arc_hdr_destroy(hdr);
34dc7c2f 6192 } else {
b9541d6b
CW
6193 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
6194 ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
b128c09f
BB
6195 /*
6196 * Invalidate issued or about to be issued
6197 * reads, since we may be about to write
6198 * over this location.
6199 */
2a432414 6200 if (HDR_L2_READING(hdr)) {
b128c09f 6201 ARCSTAT_BUMP(arcstat_l2_evict_reading);
2a432414 6202 hdr->b_flags |= ARC_FLAG_L2_EVICTED;
b128c09f
BB
6203 }
6204
ca0bf58d
PS
6205 /* Ensure this header has finished being written */
6206 ASSERT(!HDR_L2_WRITING(hdr));
6207 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
d962d5da
PS
6208
6209 arc_hdr_l2hdr_destroy(hdr);
34dc7c2f
BB
6210 }
6211 mutex_exit(hash_lock);
6212 }
b9541d6b 6213 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
6214}
6215
6216/*
6217 * Find and write ARC buffers to the L2ARC device.
6218 *
2a432414 6219 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
34dc7c2f 6220 * for reading until they have completed writing.
3a17a7a9
SK
6221 * The headroom_boost is an in-out parameter used to maintain headroom boost
6222 * state between calls to this function.
6223 *
6224 * Returns the number of bytes actually written (which may be smaller than
6225 * the delta by which the device hand has changed due to alignment).
34dc7c2f 6226 */
d164b209 6227static uint64_t
3a17a7a9
SK
6228l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
6229 boolean_t *headroom_boost)
34dc7c2f 6230{
2a432414 6231 arc_buf_hdr_t *hdr, *hdr_prev, *head;
ef56b078
AG
6232 uint64_t write_asize, write_sz, headroom, buf_compress_minsz,
6233 stats_size;
34dc7c2f 6234 void *buf_data;
3a17a7a9 6235 boolean_t full;
34dc7c2f
BB
6236 l2arc_write_callback_t *cb;
6237 zio_t *pio, *wzio;
3541dc6d 6238 uint64_t guid = spa_load_guid(spa);
d6320ddb 6239 int try;
3a17a7a9 6240 const boolean_t do_headroom_boost = *headroom_boost;
34dc7c2f 6241
34dc7c2f
BB
6242 ASSERT(dev->l2ad_vdev != NULL);
6243
3a17a7a9
SK
6244 /* Lower the flag now, we might want to raise it again later. */
6245 *headroom_boost = B_FALSE;
6246
34dc7c2f 6247 pio = NULL;
ef56b078 6248 write_sz = write_asize = 0;
34dc7c2f 6249 full = B_FALSE;
b9541d6b 6250 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
2a432414 6251 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
b9541d6b 6252 head->b_flags |= ARC_FLAG_HAS_L2HDR;
34dc7c2f 6253
3a17a7a9
SK
6254 /*
6255 * We will want to try to compress buffers that are at least 2x the
6256 * device sector size.
6257 */
6258 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
6259
34dc7c2f
BB
6260 /*
6261 * Copy buffers for L2ARC writing.
6262 */
d6320ddb 6263 for (try = 0; try <= 3; try++) {
ca0bf58d 6264 multilist_sublist_t *mls = l2arc_sublist_lock(try);
3a17a7a9
SK
6265 uint64_t passed_sz = 0;
6266
b128c09f
BB
6267 /*
6268 * L2ARC fast warmup.
6269 *
6270 * Until the ARC is warm and starts to evict, read from the
6271 * head of the ARC lists rather than the tail.
6272 */
b128c09f 6273 if (arc_warm == B_FALSE)
ca0bf58d 6274 hdr = multilist_sublist_head(mls);
b128c09f 6275 else
ca0bf58d 6276 hdr = multilist_sublist_tail(mls);
b128c09f 6277
3a17a7a9
SK
6278 headroom = target_sz * l2arc_headroom;
6279 if (do_headroom_boost)
6280 headroom = (headroom * l2arc_headroom_boost) / 100;
6281
2a432414 6282 for (; hdr; hdr = hdr_prev) {
3a17a7a9
SK
6283 kmutex_t *hash_lock;
6284 uint64_t buf_sz;
ef56b078 6285 uint64_t buf_a_sz;
3a17a7a9 6286
b128c09f 6287 if (arc_warm == B_FALSE)
ca0bf58d 6288 hdr_prev = multilist_sublist_next(mls, hdr);
b128c09f 6289 else
ca0bf58d 6290 hdr_prev = multilist_sublist_prev(mls, hdr);
34dc7c2f 6291
2a432414 6292 hash_lock = HDR_LOCK(hdr);
3a17a7a9 6293 if (!mutex_tryenter(hash_lock)) {
34dc7c2f
BB
6294 /*
6295 * Skip this buffer rather than waiting.
6296 */
6297 continue;
6298 }
6299
2a432414 6300 passed_sz += hdr->b_size;
34dc7c2f
BB
6301 if (passed_sz > headroom) {
6302 /*
6303 * Searched too far.
6304 */
6305 mutex_exit(hash_lock);
6306 break;
6307 }
6308
2a432414 6309 if (!l2arc_write_eligible(guid, hdr)) {
34dc7c2f
BB
6310 mutex_exit(hash_lock);
6311 continue;
6312 }
6313
ef56b078
AG
6314 /*
6315 * Assume that the buffer is not going to be compressed
6316 * and could take more space on disk because of a larger
6317 * disk block size.
6318 */
6319 buf_sz = hdr->b_size;
6320 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6321
6322 if ((write_asize + buf_a_sz) > target_sz) {
34dc7c2f
BB
6323 full = B_TRUE;
6324 mutex_exit(hash_lock);
6325 break;
6326 }
6327
34dc7c2f
BB
6328 if (pio == NULL) {
6329 /*
6330 * Insert a dummy header on the buflist so
6331 * l2arc_write_done() can find where the
6332 * write buffers begin without searching.
6333 */
ca0bf58d 6334 mutex_enter(&dev->l2ad_mtx);
b9541d6b 6335 list_insert_head(&dev->l2ad_buflist, head);
ca0bf58d 6336 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 6337
409dc1a5 6338 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
79c76d5b 6339 KM_SLEEP);
34dc7c2f
BB
6340 cb->l2wcb_dev = dev;
6341 cb->l2wcb_head = head;
6342 pio = zio_root(spa, l2arc_write_done, cb,
6343 ZIO_FLAG_CANFAIL);
6344 }
6345
6346 /*
6347 * Create and add a new L2ARC header.
6348 */
b9541d6b 6349 hdr->b_l2hdr.b_dev = dev;
2a432414 6350 hdr->b_flags |= ARC_FLAG_L2_WRITING;
3a17a7a9
SK
6351 /*
6352 * Temporarily stash the data buffer in b_tmp_cdata.
6353 * The subsequent write step will pick it up from
b9541d6b 6354 * there. This is because can't access b_l1hdr.b_buf
3a17a7a9
SK
6355 * without holding the hash_lock, which we in turn
6356 * can't access without holding the ARC list locks
6357 * (which we want to avoid during compression/writing)
6358 */
b9541d6b
CW
6359 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
6360 hdr->b_l2hdr.b_asize = hdr->b_size;
6361 hdr->b_l2hdr.b_hits = 0;
6362 hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
3a17a7a9 6363
d962d5da
PS
6364 /*
6365 * Explicitly set the b_daddr field to a known
6366 * value which means "invalid address". This
6367 * enables us to differentiate which stage of
6368 * l2arc_write_buffers() the particular header
6369 * is in (e.g. this loop, or the one below).
6370 * ARC_FLAG_L2_WRITING is not enough to make
6371 * this distinction, and we need to know in
6372 * order to do proper l2arc vdev accounting in
6373 * arc_release() and arc_hdr_destroy().
6374 *
6375 * Note, we can't use a new flag to distinguish
6376 * the two stages because we don't hold the
6377 * header's hash_lock below, in the second stage
6378 * of this function. Thus, we can't simply
6379 * change the b_flags field to denote that the
6380 * IO has been sent. We can change the b_daddr
6381 * field of the L2 portion, though, since we'll
6382 * be holding the l2ad_mtx; which is why we're
6383 * using it to denote the header's state change.
6384 */
6385 hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
b9541d6b 6386 hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
3a17a7a9 6387
ca0bf58d 6388 mutex_enter(&dev->l2ad_mtx);
b9541d6b 6389 list_insert_head(&dev->l2ad_buflist, hdr);
ca0bf58d 6390 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
6391
6392 /*
6393 * Compute and store the buffer cksum before
6394 * writing. On debug the cksum is verified first.
6395 */
b9541d6b
CW
6396 arc_cksum_verify(hdr->b_l1hdr.b_buf);
6397 arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
34dc7c2f
BB
6398
6399 mutex_exit(hash_lock);
6400
3a17a7a9 6401 write_sz += buf_sz;
ef56b078 6402 write_asize += buf_a_sz;
3a17a7a9
SK
6403 }
6404
ca0bf58d 6405 multilist_sublist_unlock(mls);
3a17a7a9
SK
6406
6407 if (full == B_TRUE)
6408 break;
6409 }
6410
6411 /* No buffers selected for writing? */
6412 if (pio == NULL) {
6413 ASSERT0(write_sz);
b9541d6b
CW
6414 ASSERT(!HDR_HAS_L1HDR(head));
6415 kmem_cache_free(hdr_l2only_cache, head);
3a17a7a9
SK
6416 return (0);
6417 }
6418
ca0bf58d
PS
6419 mutex_enter(&dev->l2ad_mtx);
6420
ef56b078
AG
6421 /*
6422 * Note that elsewhere in this file arcstat_l2_asize
6423 * and the used space on l2ad_vdev are updated using b_asize,
6424 * which is not necessarily rounded up to the device block size.
6425 * Too keep accounting consistent we do the same here as well:
6426 * stats_size accumulates the sum of b_asize of the written buffers,
6427 * while write_asize accumulates the sum of b_asize rounded up
6428 * to the device block size.
6429 * The latter sum is used only to validate the corectness of the code.
6430 */
6431 stats_size = 0;
6432 write_asize = 0;
6433
3a17a7a9
SK
6434 /*
6435 * Now start writing the buffers. We're starting at the write head
6436 * and work backwards, retracing the course of the buffer selector
6437 * loop above.
6438 */
b9541d6b
CW
6439 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6440 hdr = list_prev(&dev->l2ad_buflist, hdr)) {
3a17a7a9
SK
6441 uint64_t buf_sz;
6442
ca0bf58d
PS
6443 /*
6444 * We rely on the L1 portion of the header below, so
6445 * it's invalid for this header to have been evicted out
6446 * of the ghost cache, prior to being written out. The
6447 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6448 */
6449 ASSERT(HDR_HAS_L1HDR(hdr));
6450
3a17a7a9
SK
6451 /*
6452 * We shouldn't need to lock the buffer here, since we flagged
2a432414
GW
6453 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6454 * take care to only access its L2 cache parameters. In
b9541d6b 6455 * particular, hdr->l1hdr.b_buf may be invalid by now due to
2a432414 6456 * ARC eviction.
3a17a7a9 6457 */
b9541d6b 6458 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
3a17a7a9 6459
b9541d6b
CW
6460 if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
6461 hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
6462 if (l2arc_compress_buf(hdr)) {
3a17a7a9
SK
6463 /*
6464 * If compression succeeded, enable headroom
6465 * boost on the next scan cycle.
6466 */
6467 *headroom_boost = B_TRUE;
6468 }
6469 }
6470
6471 /*
6472 * Pick up the buffer data we had previously stashed away
6473 * (and now potentially also compressed).
6474 */
b9541d6b
CW
6475 buf_data = hdr->b_l1hdr.b_tmp_cdata;
6476 buf_sz = hdr->b_l2hdr.b_asize;
3a17a7a9 6477
d962d5da
PS
6478 /*
6479 * We need to do this regardless if buf_sz is zero or
6480 * not, otherwise, when this l2hdr is evicted we'll
6481 * remove a reference that was never added.
6482 */
6483 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6484
3a17a7a9
SK
6485 /* Compression may have squashed the buffer to zero length. */
6486 if (buf_sz != 0) {
ef56b078 6487 uint64_t buf_a_sz;
3a17a7a9 6488
34dc7c2f
BB
6489 wzio = zio_write_phys(pio, dev->l2ad_vdev,
6490 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6491 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6492 ZIO_FLAG_CANFAIL, B_FALSE);
6493
6494 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6495 zio_t *, wzio);
6496 (void) zio_nowait(wzio);
6497
ef56b078 6498 stats_size += buf_sz;
d962d5da 6499
b128c09f
BB
6500 /*
6501 * Keep the clock hand suitably device-aligned.
6502 */
ef56b078
AG
6503 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6504 write_asize += buf_a_sz;
6505 dev->l2ad_hand += buf_a_sz;
34dc7c2f 6506 }
34dc7c2f 6507 }
34dc7c2f 6508
b9541d6b 6509 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 6510
3a17a7a9 6511 ASSERT3U(write_asize, <=, target_sz);
34dc7c2f 6512 ARCSTAT_BUMP(arcstat_l2_writes_sent);
3a17a7a9 6513 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
34dc7c2f 6514 ARCSTAT_INCR(arcstat_l2_size, write_sz);
ef56b078
AG
6515 ARCSTAT_INCR(arcstat_l2_asize, stats_size);
6516 vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
34dc7c2f
BB
6517
6518 /*
6519 * Bump device hand to the device start if it is approaching the end.
6520 * l2arc_evict() will already have evicted ahead for this case.
6521 */
b128c09f 6522 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
34dc7c2f 6523 dev->l2ad_hand = dev->l2ad_start;
34dc7c2f
BB
6524 dev->l2ad_first = B_FALSE;
6525 }
6526
d164b209 6527 dev->l2ad_writing = B_TRUE;
34dc7c2f 6528 (void) zio_wait(pio);
d164b209
BB
6529 dev->l2ad_writing = B_FALSE;
6530
3a17a7a9
SK
6531 return (write_asize);
6532}
6533
6534/*
6535 * Compresses an L2ARC buffer.
b9541d6b 6536 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
3a17a7a9
SK
6537 * size in l2hdr->b_asize. This routine tries to compress the data and
6538 * depending on the compression result there are three possible outcomes:
6539 * *) The buffer was incompressible. The original l2hdr contents were left
6540 * untouched and are ready for writing to an L2 device.
6541 * *) The buffer was all-zeros, so there is no need to write it to an L2
6542 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
6543 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
6544 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6545 * data buffer which holds the compressed data to be written, and b_asize
6546 * tells us how much data there is. b_compress is set to the appropriate
6547 * compression algorithm. Once writing is done, invoke
6548 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6549 *
6550 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6551 * buffer was incompressible).
6552 */
6553static boolean_t
b9541d6b 6554l2arc_compress_buf(arc_buf_hdr_t *hdr)
3a17a7a9
SK
6555{
6556 void *cdata;
9b67f605 6557 size_t csize, len, rounded;
b9541d6b 6558 l2arc_buf_hdr_t *l2hdr;
3a17a7a9 6559
b9541d6b
CW
6560 ASSERT(HDR_HAS_L2HDR(hdr));
6561
6562 l2hdr = &hdr->b_l2hdr;
6563
6564 ASSERT(HDR_HAS_L1HDR(hdr));
6565 ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
6566 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
3a17a7a9
SK
6567
6568 len = l2hdr->b_asize;
6569 cdata = zio_data_buf_alloc(len);
b9541d6b
CW
6570 ASSERT3P(cdata, !=, NULL);
6571 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
3a17a7a9
SK
6572 cdata, l2hdr->b_asize);
6573
9b67f605
MA
6574 rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6575 if (rounded > csize) {
6576 bzero((char *)cdata + csize, rounded - csize);
6577 csize = rounded;
6578 }
6579
3a17a7a9
SK
6580 if (csize == 0) {
6581 /* zero block, indicate that there's nothing to write */
6582 zio_data_buf_free(cdata, len);
b9541d6b 6583 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
3a17a7a9 6584 l2hdr->b_asize = 0;
b9541d6b 6585 hdr->b_l1hdr.b_tmp_cdata = NULL;
3a17a7a9
SK
6586 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6587 return (B_TRUE);
6588 } else if (csize > 0 && csize < len) {
6589 /*
6590 * Compression succeeded, we'll keep the cdata around for
6591 * writing and release it afterwards.
6592 */
b9541d6b 6593 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
3a17a7a9 6594 l2hdr->b_asize = csize;
b9541d6b 6595 hdr->b_l1hdr.b_tmp_cdata = cdata;
3a17a7a9
SK
6596 ARCSTAT_BUMP(arcstat_l2_compress_successes);
6597 return (B_TRUE);
6598 } else {
6599 /*
6600 * Compression failed, release the compressed buffer.
6601 * l2hdr will be left unmodified.
6602 */
6603 zio_data_buf_free(cdata, len);
6604 ARCSTAT_BUMP(arcstat_l2_compress_failures);
6605 return (B_FALSE);
6606 }
6607}
6608
6609/*
6610 * Decompresses a zio read back from an l2arc device. On success, the
6611 * underlying zio's io_data buffer is overwritten by the uncompressed
6612 * version. On decompression error (corrupt compressed stream), the
6613 * zio->io_error value is set to signal an I/O error.
6614 *
6615 * Please note that the compressed data stream is not checksummed, so
6616 * if the underlying device is experiencing data corruption, we may feed
6617 * corrupt data to the decompressor, so the decompressor needs to be
6618 * able to handle this situation (LZ4 does).
6619 */
6620static void
6621l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6622{
6623 uint64_t csize;
6624 void *cdata;
6625
6626 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6627
6628 if (zio->io_error != 0) {
6629 /*
6630 * An io error has occured, just restore the original io
6631 * size in preparation for a main pool read.
6632 */
6633 zio->io_orig_size = zio->io_size = hdr->b_size;
6634 return;
6635 }
6636
6637 if (c == ZIO_COMPRESS_EMPTY) {
6638 /*
6639 * An empty buffer results in a null zio, which means we
6640 * need to fill its io_data after we're done restoring the
6641 * buffer's contents.
6642 */
b9541d6b
CW
6643 ASSERT(hdr->b_l1hdr.b_buf != NULL);
6644 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6645 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
3a17a7a9
SK
6646 } else {
6647 ASSERT(zio->io_data != NULL);
6648 /*
6649 * We copy the compressed data from the start of the arc buffer
6650 * (the zio_read will have pulled in only what we need, the
6651 * rest is garbage which we will overwrite at decompression)
6652 * and then decompress back to the ARC data buffer. This way we
6653 * can minimize copying by simply decompressing back over the
6654 * original compressed data (rather than decompressing to an
6655 * aux buffer and then copying back the uncompressed buffer,
6656 * which is likely to be much larger).
6657 */
6658 csize = zio->io_size;
6659 cdata = zio_data_buf_alloc(csize);
6660 bcopy(zio->io_data, cdata, csize);
6661 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6662 hdr->b_size) != 0)
2e528b49 6663 zio->io_error = SET_ERROR(EIO);
3a17a7a9
SK
6664 zio_data_buf_free(cdata, csize);
6665 }
6666
6667 /* Restore the expected uncompressed IO size. */
6668 zio->io_orig_size = zio->io_size = hdr->b_size;
6669}
6670
6671/*
6672 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6673 * This buffer serves as a temporary holder of compressed data while
6674 * the buffer entry is being written to an l2arc device. Once that is
6675 * done, we can dispose of it.
6676 */
6677static void
2a432414 6678l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
3a17a7a9 6679{
ca0bf58d
PS
6680 enum zio_compress comp = HDR_GET_COMPRESS(hdr);
6681
b9541d6b 6682 ASSERT(HDR_HAS_L1HDR(hdr));
ca0bf58d
PS
6683 ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6684
6685 if (comp == ZIO_COMPRESS_OFF) {
6686 /*
6687 * In this case, b_tmp_cdata points to the same buffer
6688 * as the arc_buf_t's b_data field. We don't want to
6689 * free it, since the arc_buf_t will handle that.
6690 */
6691 hdr->b_l1hdr.b_tmp_cdata = NULL;
6692 } else if (comp == ZIO_COMPRESS_EMPTY) {
6693 /*
6694 * In this case, b_tmp_cdata was compressed to an empty
6695 * buffer, thus there's nothing to free and b_tmp_cdata
6696 * should have been set to NULL in l2arc_write_buffers().
6697 */
6698 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6699 } else {
3a17a7a9
SK
6700 /*
6701 * If the data was compressed, then we've allocated a
6702 * temporary buffer for it, so now we need to release it.
6703 */
b9541d6b
CW
6704 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6705 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6706 hdr->b_size);
ca0bf58d 6707 hdr->b_l1hdr.b_tmp_cdata = NULL;
3a17a7a9 6708 }
ca0bf58d 6709
34dc7c2f
BB
6710}
6711
6712/*
6713 * This thread feeds the L2ARC at regular intervals. This is the beating
6714 * heart of the L2ARC.
6715 */
6716static void
6717l2arc_feed_thread(void)
6718{
6719 callb_cpr_t cpr;
6720 l2arc_dev_t *dev;
6721 spa_t *spa;
d164b209 6722 uint64_t size, wrote;
428870ff 6723 clock_t begin, next = ddi_get_lbolt();
3a17a7a9 6724 boolean_t headroom_boost = B_FALSE;
40d06e3c 6725 fstrans_cookie_t cookie;
34dc7c2f
BB
6726
6727 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6728
6729 mutex_enter(&l2arc_feed_thr_lock);
6730
40d06e3c 6731 cookie = spl_fstrans_mark();
34dc7c2f 6732 while (l2arc_thread_exit == 0) {
34dc7c2f 6733 CALLB_CPR_SAFE_BEGIN(&cpr);
b64ccd6c 6734 (void) cv_timedwait_sig(&l2arc_feed_thr_cv,
5b63b3eb 6735 &l2arc_feed_thr_lock, next);
34dc7c2f 6736 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
428870ff 6737 next = ddi_get_lbolt() + hz;
34dc7c2f
BB
6738
6739 /*
b128c09f 6740 * Quick check for L2ARC devices.
34dc7c2f
BB
6741 */
6742 mutex_enter(&l2arc_dev_mtx);
6743 if (l2arc_ndev == 0) {
6744 mutex_exit(&l2arc_dev_mtx);
6745 continue;
6746 }
b128c09f 6747 mutex_exit(&l2arc_dev_mtx);
428870ff 6748 begin = ddi_get_lbolt();
34dc7c2f
BB
6749
6750 /*
b128c09f
BB
6751 * This selects the next l2arc device to write to, and in
6752 * doing so the next spa to feed from: dev->l2ad_spa. This
6753 * will return NULL if there are now no l2arc devices or if
6754 * they are all faulted.
6755 *
6756 * If a device is returned, its spa's config lock is also
6757 * held to prevent device removal. l2arc_dev_get_next()
6758 * will grab and release l2arc_dev_mtx.
34dc7c2f 6759 */
b128c09f 6760 if ((dev = l2arc_dev_get_next()) == NULL)
34dc7c2f 6761 continue;
b128c09f
BB
6762
6763 spa = dev->l2ad_spa;
6764 ASSERT(spa != NULL);
34dc7c2f 6765
572e2857
BB
6766 /*
6767 * If the pool is read-only then force the feed thread to
6768 * sleep a little longer.
6769 */
6770 if (!spa_writeable(spa)) {
6771 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6772 spa_config_exit(spa, SCL_L2ARC, dev);
6773 continue;
6774 }
6775
34dc7c2f 6776 /*
b128c09f 6777 * Avoid contributing to memory pressure.
34dc7c2f 6778 */
ca67b33a 6779 if (arc_reclaim_needed()) {
b128c09f
BB
6780 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6781 spa_config_exit(spa, SCL_L2ARC, dev);
34dc7c2f
BB
6782 continue;
6783 }
b128c09f 6784
34dc7c2f
BB
6785 ARCSTAT_BUMP(arcstat_l2_feeds);
6786
3a17a7a9 6787 size = l2arc_write_size();
b128c09f 6788
34dc7c2f
BB
6789 /*
6790 * Evict L2ARC buffers that will be overwritten.
6791 */
b128c09f 6792 l2arc_evict(dev, size, B_FALSE);
34dc7c2f
BB
6793
6794 /*
6795 * Write ARC buffers.
6796 */
3a17a7a9 6797 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
d164b209
BB
6798
6799 /*
6800 * Calculate interval between writes.
6801 */
6802 next = l2arc_write_interval(begin, size, wrote);
b128c09f 6803 spa_config_exit(spa, SCL_L2ARC, dev);
34dc7c2f 6804 }
40d06e3c 6805 spl_fstrans_unmark(cookie);
34dc7c2f
BB
6806
6807 l2arc_thread_exit = 0;
6808 cv_broadcast(&l2arc_feed_thr_cv);
6809 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
6810 thread_exit();
6811}
6812
b128c09f
BB
6813boolean_t
6814l2arc_vdev_present(vdev_t *vd)
6815{
6816 l2arc_dev_t *dev;
6817
6818 mutex_enter(&l2arc_dev_mtx);
6819 for (dev = list_head(l2arc_dev_list); dev != NULL;
6820 dev = list_next(l2arc_dev_list, dev)) {
6821 if (dev->l2ad_vdev == vd)
6822 break;
6823 }
6824 mutex_exit(&l2arc_dev_mtx);
6825
6826 return (dev != NULL);
6827}
6828
34dc7c2f
BB
6829/*
6830 * Add a vdev for use by the L2ARC. By this point the spa has already
6831 * validated the vdev and opened it.
6832 */
6833void
9babb374 6834l2arc_add_vdev(spa_t *spa, vdev_t *vd)
34dc7c2f
BB
6835{
6836 l2arc_dev_t *adddev;
6837
b128c09f
BB
6838 ASSERT(!l2arc_vdev_present(vd));
6839
34dc7c2f
BB
6840 /*
6841 * Create a new l2arc device entry.
6842 */
6843 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
6844 adddev->l2ad_spa = spa;
6845 adddev->l2ad_vdev = vd;
9babb374
BB
6846 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
6847 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
34dc7c2f 6848 adddev->l2ad_hand = adddev->l2ad_start;
34dc7c2f 6849 adddev->l2ad_first = B_TRUE;
d164b209 6850 adddev->l2ad_writing = B_FALSE;
98f72a53 6851 list_link_init(&adddev->l2ad_node);
34dc7c2f 6852
b9541d6b 6853 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
6854 /*
6855 * This is a list of all ARC buffers that are still valid on the
6856 * device.
6857 */
b9541d6b
CW
6858 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
6859 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
34dc7c2f 6860
428870ff 6861 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
d962d5da 6862 refcount_create(&adddev->l2ad_alloc);
34dc7c2f
BB
6863
6864 /*
6865 * Add device to global list
6866 */
6867 mutex_enter(&l2arc_dev_mtx);
6868 list_insert_head(l2arc_dev_list, adddev);
6869 atomic_inc_64(&l2arc_ndev);
6870 mutex_exit(&l2arc_dev_mtx);
6871}
6872
6873/*
6874 * Remove a vdev from the L2ARC.
6875 */
6876void
6877l2arc_remove_vdev(vdev_t *vd)
6878{
6879 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6880
34dc7c2f
BB
6881 /*
6882 * Find the device by vdev
6883 */
6884 mutex_enter(&l2arc_dev_mtx);
6885 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6886 nextdev = list_next(l2arc_dev_list, dev);
6887 if (vd == dev->l2ad_vdev) {
6888 remdev = dev;
6889 break;
6890 }
6891 }
6892 ASSERT(remdev != NULL);
6893
6894 /*
6895 * Remove device from global list
6896 */
6897 list_remove(l2arc_dev_list, remdev);
6898 l2arc_dev_last = NULL; /* may have been invalidated */
b128c09f
BB
6899 atomic_dec_64(&l2arc_ndev);
6900 mutex_exit(&l2arc_dev_mtx);
34dc7c2f
BB
6901
6902 /*
6903 * Clear all buflists and ARC references. L2ARC device flush.
6904 */
6905 l2arc_evict(remdev, 0, B_TRUE);
b9541d6b
CW
6906 list_destroy(&remdev->l2ad_buflist);
6907 mutex_destroy(&remdev->l2ad_mtx);
d962d5da 6908 refcount_destroy(&remdev->l2ad_alloc);
34dc7c2f 6909 kmem_free(remdev, sizeof (l2arc_dev_t));
34dc7c2f
BB
6910}
6911
6912void
b128c09f 6913l2arc_init(void)
34dc7c2f
BB
6914{
6915 l2arc_thread_exit = 0;
6916 l2arc_ndev = 0;
6917 l2arc_writes_sent = 0;
6918 l2arc_writes_done = 0;
6919
6920 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6921 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6922 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
6923 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6924
6925 l2arc_dev_list = &L2ARC_dev_list;
6926 l2arc_free_on_write = &L2ARC_free_on_write;
6927 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6928 offsetof(l2arc_dev_t, l2ad_node));
6929 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6930 offsetof(l2arc_data_free_t, l2df_list_node));
34dc7c2f
BB
6931}
6932
6933void
b128c09f 6934l2arc_fini(void)
34dc7c2f 6935{
b128c09f
BB
6936 /*
6937 * This is called from dmu_fini(), which is called from spa_fini();
6938 * Because of this, we can assume that all l2arc devices have
6939 * already been removed when the pools themselves were removed.
6940 */
6941
6942 l2arc_do_free_on_write();
34dc7c2f
BB
6943
6944 mutex_destroy(&l2arc_feed_thr_lock);
6945 cv_destroy(&l2arc_feed_thr_cv);
6946 mutex_destroy(&l2arc_dev_mtx);
34dc7c2f
BB
6947 mutex_destroy(&l2arc_free_on_write_mtx);
6948
6949 list_destroy(l2arc_dev_list);
6950 list_destroy(l2arc_free_on_write);
6951}
b128c09f
BB
6952
6953void
6954l2arc_start(void)
6955{
fb5f0bc8 6956 if (!(spa_mode_global & FWRITE))
b128c09f
BB
6957 return;
6958
6959 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6960 TS_RUN, minclsyspri);
6961}
6962
6963void
6964l2arc_stop(void)
6965{
fb5f0bc8 6966 if (!(spa_mode_global & FWRITE))
b128c09f
BB
6967 return;
6968
6969 mutex_enter(&l2arc_feed_thr_lock);
6970 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
6971 l2arc_thread_exit = 1;
6972 while (l2arc_thread_exit != 0)
6973 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6974 mutex_exit(&l2arc_feed_thr_lock);
6975}
c28b2279
BB
6976
6977#if defined(_KERNEL) && defined(HAVE_SPL)
0f699108
AZ
6978EXPORT_SYMBOL(arc_buf_size);
6979EXPORT_SYMBOL(arc_write);
c28b2279
BB
6980EXPORT_SYMBOL(arc_read);
6981EXPORT_SYMBOL(arc_buf_remove_ref);
e0b0ca98 6982EXPORT_SYMBOL(arc_buf_info);
c28b2279 6983EXPORT_SYMBOL(arc_getbuf_func);
ab26409d
BB
6984EXPORT_SYMBOL(arc_add_prune_callback);
6985EXPORT_SYMBOL(arc_remove_prune_callback);
c28b2279 6986
bce45ec9 6987module_param(zfs_arc_min, ulong, 0644);
c409e464 6988MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
c28b2279 6989
bce45ec9 6990module_param(zfs_arc_max, ulong, 0644);
c409e464 6991MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
c28b2279 6992
bce45ec9 6993module_param(zfs_arc_meta_limit, ulong, 0644);
c28b2279 6994MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
6a8f9b6b 6995
ca0bf58d
PS
6996module_param(zfs_arc_meta_min, ulong, 0644);
6997MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");
6998
bce45ec9 6999module_param(zfs_arc_meta_prune, int, 0644);
2cbb06b5 7000MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
c409e464 7001
ca67b33a 7002module_param(zfs_arc_meta_adjust_restarts, int, 0644);
bc888666
BB
7003MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
7004 "Limit number of restarts in arc_adjust_meta");
7005
f6046738
BB
7006module_param(zfs_arc_meta_strategy, int, 0644);
7007MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");
7008
bce45ec9 7009module_param(zfs_arc_grow_retry, int, 0644);
c409e464
BB
7010MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
7011
89c8cac4
PS
7012module_param(zfs_arc_p_aggressive_disable, int, 0644);
7013MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");
7014
62422785
PS
7015module_param(zfs_arc_p_dampener_disable, int, 0644);
7016MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
7017
bce45ec9 7018module_param(zfs_arc_shrink_shift, int, 0644);
c409e464
BB
7019MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
7020
728d6ae9
BB
7021module_param(zfs_arc_p_min_shift, int, 0644);
7022MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
7023
1f7c30df
BB
7024module_param(zfs_disable_dup_eviction, int, 0644);
7025MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
7026
49ddb315
MA
7027module_param(zfs_arc_average_blocksize, int, 0444);
7028MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
7029
0c5493d4
BB
7030module_param(zfs_arc_memory_throttle_disable, int, 0644);
7031MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
7032
bce45ec9
BB
7033module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
7034MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
7035
ca0bf58d
PS
7036module_param(zfs_arc_num_sublists_per_state, int, 0644);
7037MODULE_PARM_DESC(zfs_arc_num_sublists_per_state,
7038 "Number of sublists used in each of the ARC state lists");
7039
bce45ec9 7040module_param(l2arc_write_max, ulong, 0644);
abd8610c
BB
7041MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
7042
bce45ec9 7043module_param(l2arc_write_boost, ulong, 0644);
abd8610c
BB
7044MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
7045
bce45ec9 7046module_param(l2arc_headroom, ulong, 0644);
abd8610c
BB
7047MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
7048
3a17a7a9
SK
7049module_param(l2arc_headroom_boost, ulong, 0644);
7050MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
7051
bce45ec9 7052module_param(l2arc_feed_secs, ulong, 0644);
abd8610c
BB
7053MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
7054
bce45ec9 7055module_param(l2arc_feed_min_ms, ulong, 0644);
abd8610c
BB
7056MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
7057
bce45ec9 7058module_param(l2arc_noprefetch, int, 0644);
abd8610c
BB
7059MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
7060
3a17a7a9
SK
7061module_param(l2arc_nocompress, int, 0644);
7062MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
7063
bce45ec9 7064module_param(l2arc_feed_again, int, 0644);
abd8610c
BB
7065MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
7066
bce45ec9 7067module_param(l2arc_norw, int, 0644);
abd8610c
BB
7068MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
7069
c28b2279 7070#endif