/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

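/*
 * Illustrative sketch (not part of the original source): the difference
 * described above boils down to "evict the oldest *unreferenced* block"
 * rather than simply "evict the last block".  The toy list walk below,
 * using hypothetical types and field names, shows the shape of that
 * policy; the real code operates on multilists of arc_buf_hdr_t with
 * refcount_t holds.
 */
#if 0
typedef struct toy_entry {
	struct toy_entry *te_next;	/* toward more recently used entries */
	int te_refs;			/* external holds; 0 means evictable */
	size_t te_size;
} toy_entry_t;

/* Walk from the LRU end toward the MRU end, skipping held entries. */
static toy_entry_t *
toy_choose_victim(toy_entry_t *lru_end)
{
	toy_entry_t *te;

	for (te = lru_end; te != NULL; te = te->te_next) {
		if (te->te_refs == 0)
			return (te);	/* lowest evictable entry */
	}
	return (NULL);	/* nothing evictable: caller must throttle instead */
}
#endif
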
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()). Note however that the data associated
 * with the buffer may be evicted prior to the callback. The callback
 * must be made with *no locks held* (to prevent deadlock). Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * It is also possible to register a callback which is run when the
 * arc_meta_limit is reached and no buffers can be safely evicted. In
 * this case the arc user should drop a reference on some arc buffers so
 * they can be reclaimed and the arc_meta_limit honored. For example,
 * when using the ZPL each dentry holds a reference on a znode. These
 * dentries must be pruned before the arc buffer holding the znode can
 * be safely evicted.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 * - L2ARC buflist creation
 * - L2ARC buflist eviction
 * - L2ARC write completion, which walks L2ARC buflists
 * - ARC header destruction, as it removes from L2ARC buflists
 * - ARC header release, as it removes from L2ARC buflists
 */

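/*
 * Illustrative sketch (not part of the original source): the "method 1"
 * lookup described above returns with the hash mutex held, so the caller
 * is responsible for dropping it.  Hypothetical usage, assuming a valid
 * spa guid and block pointer:
 */
#if 0
static void
example_hash_lookup(uint64_t spa_guid, const blkptr_t *bp)
{
	kmutex_t *hash_lock;
	arc_buf_hdr_t *hdr;

	hdr = buf_hash_find(spa_guid, bp, &hash_lock);
	if (hdr != NULL) {
		/* hash_lock is held here, so hdr's fields are stable */
		mutex_exit(hash_lock);
	}
	/* on a miss, hash_lock is set to NULL and nothing is held */
}
#endif
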
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/zpl.h>
#include <linux/mm_compat.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/dmu_tx.h>
#include <zfs_fletcher.h>
#include <sys/arc_impl.h>
#include <sys/trace_arc.h>

#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
#endif

static kmutex_t arc_reclaim_lock;
static kcondvar_t arc_reclaim_thread_cv;
static boolean_t arc_reclaim_thread_exit;
static kcondvar_t arc_reclaim_waiters_cv;

static kmutex_t arc_user_evicts_lock;
static kcondvar_t arc_user_evicts_cv;
static boolean_t arc_user_evicts_thread_exit;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/*
 * The number of sublists used for each of the arc state lists. If this
 * is not set to a suitable value by the user, it will be configured to
 * the number of CPUs on the system in arc_init().
 */
int zfs_arc_num_sublists_per_state = 0;

/* number of seconds before growing cache again */
static int arc_grow_retry = 5;

/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
int zfs_arc_overflow_shift = 8;

/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int arc_no_grow_shift = 5;


/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0;
unsigned long zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_disable_dup_eviction = 0;
int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */

/*
 * These tunables are Linux specific
 */
int zfs_arc_memory_throttle_disable = 1;
int zfs_arc_min_prefetch_lifespan = 0;
int zfs_arc_p_aggressive_disable = 1;
int zfs_arc_p_dampener_disable = 1;
int zfs_arc_meta_prune = 10000;
int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
int zfs_arc_meta_adjust_restarts = 4096;

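/*
 * Worked example (not part of the original source): with the defaults
 * above and, say, arc_c == 4 GiB, arc_c >> arc_no_grow_shift (5) is
 * 128 MiB of free memory required before the ARC may grow, while
 * arc_c >> arc_shrink_shift (7) is 32 MiB, the granularity of one
 * shrink step.  A minimal sketch of that arithmetic:
 */
#if 0
static void
example_shift_tunables(void)
{
	uint64_t c = 4ULL << 30;			/* example arc_c: 4 GiB */
	uint64_t no_grow_thresh = c >> 5;		/* arc_no_grow_shift */
	uint64_t shrink_step = c >> 7;			/* arc_shrink_shift */

	ASSERT3U(no_grow_thresh, ==, 128ULL << 20);	/* 128 MiB */
	ASSERT3U(shrink_step, ==, 32ULL << 20);		/* 32 MiB */
}
#endif
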
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread. The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
	 */
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA. This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by various buffers and structures
	 * not actually backed with ARC buffers. This includes bonus
	 * buffers (allocated directly via zio_buf_* functions),
	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
	 * cache), and dnode_t structures (allocated via dnode_t cache).
	 */
	kstat_named_t arcstat_other_size;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state. This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state. This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state. The key thing to note
	 * here is that this size doesn't actually indicate
	 * RAM consumption. The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state. This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state. See the comment above
	 * arcstat_mru_ghost_size for more details.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_cdata_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_asize;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_compress_successes;
	kstat_named_t arcstat_l2_compress_zeros;
	kstat_named_t arcstat_l2_compress_failures;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_duplicate_buffers;
	kstat_named_t arcstat_duplicate_buffers_size;
	kstat_named_t arcstat_duplicate_reads;
	kstat_named_t arcstat_memory_direct_count;
	kstat_named_t arcstat_memory_indirect_count;
	kstat_named_t arcstat_no_grow;
	kstat_named_t arcstat_tempreserve;
	kstat_named_t arcstat_loaned_bytes;
	kstat_named_t arcstat_prune;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "demand_data_hits", KSTAT_DATA_UINT64 },
	{ "demand_data_misses", KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
	{ "mru_hits", KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
	{ "mfu_hits", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
	{ "deleted", KSTAT_DATA_UINT64 },
	{ "mutex_miss", KSTAT_DATA_UINT64 },
	{ "evict_skip", KSTAT_DATA_UINT64 },
	{ "evict_not_enough", KSTAT_DATA_UINT64 },
	{ "evict_l2_cached", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_skip", KSTAT_DATA_UINT64 },
	{ "hash_elements", KSTAT_DATA_UINT64 },
	{ "hash_elements_max", KSTAT_DATA_UINT64 },
	{ "hash_collisions", KSTAT_DATA_UINT64 },
	{ "hash_chains", KSTAT_DATA_UINT64 },
	{ "hash_chain_max", KSTAT_DATA_UINT64 },
	{ "p", KSTAT_DATA_UINT64 },
	{ "c", KSTAT_DATA_UINT64 },
	{ "c_min", KSTAT_DATA_UINT64 },
	{ "c_max", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
	{ "hdr_size", KSTAT_DATA_UINT64 },
	{ "data_size", KSTAT_DATA_UINT64 },
	{ "metadata_size", KSTAT_DATA_UINT64 },
	{ "other_size", KSTAT_DATA_UINT64 },
	{ "anon_size", KSTAT_DATA_UINT64 },
	{ "anon_evictable_data", KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_size", KSTAT_DATA_UINT64 },
	{ "mru_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_ghost_size", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits", KSTAT_DATA_UINT64 },
	{ "l2_misses", KSTAT_DATA_UINT64 },
	{ "l2_feeds", KSTAT_DATA_UINT64 },
	{ "l2_rw_clash", KSTAT_DATA_UINT64 },
	{ "l2_read_bytes", KSTAT_DATA_UINT64 },
	{ "l2_write_bytes", KSTAT_DATA_UINT64 },
	{ "l2_writes_sent", KSTAT_DATA_UINT64 },
	{ "l2_writes_done", KSTAT_DATA_UINT64 },
	{ "l2_writes_error", KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_reading", KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
	{ "l2_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
	{ "l2_io_error", KSTAT_DATA_UINT64 },
	{ "l2_size", KSTAT_DATA_UINT64 },
	{ "l2_asize", KSTAT_DATA_UINT64 },
	{ "l2_hdr_size", KSTAT_DATA_UINT64 },
	{ "l2_compress_successes", KSTAT_DATA_UINT64 },
	{ "l2_compress_zeros", KSTAT_DATA_UINT64 },
	{ "l2_compress_failures", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count", KSTAT_DATA_UINT64 },
	{ "duplicate_buffers", KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size", KSTAT_DATA_UINT64 },
	{ "duplicate_reads", KSTAT_DATA_UINT64 },
	{ "memory_direct_count", KSTAT_DATA_UINT64 },
	{ "memory_indirect_count", KSTAT_DATA_UINT64 },
	{ "arc_no_grow", KSTAT_DATA_UINT64 },
	{ "arc_tempreserve", KSTAT_DATA_UINT64 },
	{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
	{ "arc_prune", KSTAT_DATA_UINT64 },
	{ "arc_meta_used", KSTAT_DATA_UINT64 },
	{ "arc_meta_limit", KSTAT_DATA_UINT64 },
	{ "arc_meta_max", KSTAT_DATA_UINT64 },
	{ "arc_meta_min", KSTAT_DATA_UINT64 }
};

#define ARCSTAT(stat) (arc_stats.stat.value.ui64)

#define ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)

#define ARCSTAT_MAX(stat, val) { \
	uint64_t m; \
	while ((val) > (m = arc_stats.stat.value.ui64) && \
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
		continue; \
}

#define ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) { \
		if (cond2) { \
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else { \
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		} \
	} else { \
		if (cond2) { \
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else { \
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		} \
	}

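/*
 * Illustrative sketch (not part of the original source): ARCSTAT_CONDSTAT
 * expands into a two-level if/else that bumps exactly one of four kstats.
 * A hypothetical call classifying a hit by demand-vs-prefetch and
 * data-vs-metadata (using the HDR_* flag macros defined further below)
 * would look like this; the real call sites appear later in this file.
 */
#if 0
static void
example_condstat(arc_buf_hdr_t *hdr)
{
	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
	/*
	 * e.g. a demand metadata hit increments
	 * arc_stats.arcstat_demand_metadata_hits.
	 */
}
#endif
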
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
#define arc_no_grow ARCSTAT(arcstat_no_grow)
#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static list_t arc_prune_list;
static kmutex_t arc_prune_mtx;
static taskq_t *arc_prune_taskq;
static arc_buf_t *arc_eviction_list;
static arc_buf_hdr_t arc_eviction_hdr;

#define GHOST_STATE(state) \
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
	(state) == arc_l2c_only)

#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)

#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define HDR_L2_READING(hdr) \
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)

#define HDR_ISTYPE_METADATA(hdr) \
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))

#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/* For storing compression mode in b_flags */
#define HDR_COMPRESS_OFFSET 24
#define HDR_COMPRESS_NBITS 7

#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))

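/*
 * Illustrative sketch (not part of the original source): the compression
 * mode occupies HDR_COMPRESS_NBITS bits of b_flags starting at
 * HDR_COMPRESS_OFFSET, so it is read and written through the accessors
 * above rather than by masking b_flags directly.  Hypothetical usage:
 */
#if 0
static void
example_hdr_compress(arc_buf_hdr_t *hdr)
{
	HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_LZ4);
	ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
}
#endif
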
/*
 * Other sizes
 */

#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define HT_LOCK_ALIGN 64
#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))

struct ht_lock {
	kmutex_t ht_lock;
#ifdef _KERNEL
	unsigned char pad[HT_LOCK_PAD];
#endif
};

#define BUF_LOCKS 8192
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
#define L2ARC_HEADROOM 2 /* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define L2ARC_HEADROOM_BOOST 200
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */

/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to an l2arc disk
 * address. This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out, which happens in the second stage of
 * l2arc_write_buffers().
 */
#define L2ARC_ADDR_UNSET ((uint64_t)(-1))

#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
int l2arc_nocompress = B_FALSE; /* don't compress bufs */
int l2arc_feed_again = B_TRUE; /* turbo warmup */
int l2arc_norw = B_FALSE; /* no reads during writes */

/*
 * L2ARC Internals
 */
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
static list_t L2ARC_free_on_write; /* free after write buf list */
static list_t *l2arc_free_on_write; /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
static uint64_t l2arc_ndev; /* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_t *l2rcb_buf; /* read buffer */
	spa_t *l2rcb_spa; /* spa */
	blkptr_t l2rcb_bp; /* original blkptr */
	zbookmark_phys_t l2rcb_zb; /* original bookmark */
	int l2rcb_flags; /* original flags */
	enum zio_compress l2rcb_compress; /* applied compress */
} l2arc_read_callback_t;

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	void *l2df_data;
	size_t l2df_size;
	void (*l2df_func)(void *, size_t);
	list_node_t l2df_list_node;
} l2arc_data_free_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
static void arc_tuning_update(void);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);

static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);

static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);

static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spa>>8) ^ birth;

	return (crc);
}

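/*
 * Illustrative sketch (not part of the original source): buf_hash() folds
 * the DVA through the CRC-64 table and mixes in the spa guid and birth
 * txg; BUF_HASH_INDEX() masks that down to a chain index, and
 * BUF_HASH_LOCK() maps the index onto one of the BUF_LOCKS lock stripes.
 * Hypothetical usage:
 */
#if 0
static kmutex_t *
example_lock_for(uint64_t spa_guid, const dva_t *dva, uint64_t birth)
{
	uint64_t idx = BUF_HASH_INDEX(spa_guid, dva, birth);

	/* the same identity always maps to the same chain and lock stripe */
	return (BUF_HASH_LOCK(idx));
}
#endif
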
#define BUF_EMPTY(buf) \
	((buf)->b_dva.dva_word[0] == 0 && \
	(buf)->b_dva.dva_word[1] == 0)

#define BUF_EQUAL(spa, dva, birth, buf) \
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
	hdr->b_dva.dva_word[0] = 0;
	hdr->b_dva.dva_word[1] = 0;
	hdr->b_birth = 0;
}

static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *hdr;

	mutex_enter(hash_lock);
	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
	    hdr = hdr->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, hdr)) {
			*lockp = hash_lock;
			return (hdr);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table. If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 * If lockp == NULL, the caller is assumed to already hold the hash lock.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fhdr;
	uint32_t i;

	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
	ASSERT(hdr->b_birth != 0);
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (lockp != NULL) {
		*lockp = hash_lock;
		mutex_enter(hash_lock);
	} else {
		ASSERT(MUTEX_HELD(hash_lock));
	}

	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
	    fhdr = fhdr->b_hash_next, i++) {
		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
			return (fhdr);
	}

	hdr->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = hdr;
	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

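/*
 * Illustrative sketch (not part of the original source): because
 * buf_hash_insert() returns the already-present header instead of
 * inserting a duplicate, and leaves the hash lock held via lockp in
 * either case, callers can treat it as an atomic "insert or find
 * existing" step.  Hypothetical usage:
 */
#if 0
static void
example_insert_or_find(arc_buf_hdr_t *hdr)
{
	kmutex_t *hash_lock;
	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);

	if (exists != NULL) {
		/* someone else already cached this block; use 'exists' */
	}
	mutex_exit(hash_lock);
}
#endif
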
static void
buf_hash_remove(arc_buf_hdr_t *hdr)
{
	arc_buf_hdr_t *fhdr, **hdrp;
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(hdr));

	hdrp = &buf_hash_table.ht_table[idx];
	while ((fhdr = *hdrp) != hdr) {
		ASSERT(fhdr != NULL);
		hdrp = &fhdr->b_hash_next;
	}
	*hdrp = hdr->b_hash_next;
	hdr->b_hash_next = NULL;
	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

#if defined(_KERNEL) && defined(HAVE_SPL)
	/*
	 * Large allocations which do not require contiguous pages
	 * should be using vmem_free() in the linux kernel.
	 */
	vmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
#else
	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
#endif
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_full_cache);
	kmem_cache_destroy(hdr_l2only_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_FULL_SIZE);
	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
	refcount_create(&hdr->b_l1hdr.b_refcnt);
	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	list_link_init(&hdr->b_l1hdr.b_arc_node);
	list_link_init(&hdr->b_l2hdr.b_l2node);
	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_L2ONLY_SIZE);
	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);

	return (0);
}

/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_full_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	cv_destroy(&hdr->b_l1hdr.b_cv);
	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
	ASSERTV(arc_buf_hdr_t *hdr = vbuf);

	ASSERT(BUF_EMPTY(hdr));
	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}

/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
	 */
	while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
#if defined(_KERNEL) && defined(HAVE_SPL)
	/*
	 * Large allocations which do not require contiguous pages
	 * should be using vmem_alloc() in the linux kernel
	 */
	buf_hash_table.ht_table =
	    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
#else
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
#endif
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
	    NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

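/*
 * Worked example (not part of the original source): buf_init() keeps
 * doubling hsize until hsize * zfs_arc_average_blocksize covers physical
 * memory.  With 8 GiB of RAM and the default 8 KiB average block size
 * that settles on 2^20 chains, i.e. 8 MiB of ht_table pointers on a
 * 64-bit kernel (the "1MB per GB" noted in the comment above).
 */
#if 0
static uint64_t
example_ht_slots(uint64_t physmem_bytes, uint64_t avg_blocksize)
{
	uint64_t hsize = 1ULL << 12;

	while (hsize * avg_blocksize < physmem_bytes)
		hsize <<= 1;
	return (hsize);	/* 8 GiB / 8 KiB -> 1048576 chains */
}
#endif
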
/*
 * Transition between the two allocation states for the arc_buf_hdr struct.
 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 * version is used when a cache buffer is only in the L2ARC in order to reduce
 * memory usage.
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev;

	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	dev = hdr->b_l2hdr.b_dev;
	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;

		/* Verify previous threads set to NULL before freeing */
		ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
	} else {
		ASSERT(hdr->b_l1hdr.b_buf == NULL);
		ASSERT0(hdr->b_l1hdr.b_datacnt);

		/*
		 * If we've reached here, we must have been called from
		 * arc_evict_hdr(); as such we should have already been
		 * removed from any ghost list we were previously on
		 * (which protects us from racing with arc_evict_state),
		 * thus no locking is needed during this check.
		 */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		/*
		 * A buffer must not be moved into the arc_l2c_only
		 * state if it's not finished being written out to the
		 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
		 * might try to be accessed, even though it was removed.
		 */
		VERIFY(!HDR_L2_WRITING(hdr));
		VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);

		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) refcount_remove_many(&dev->l2ad_alloc,
	    hdr->b_l2hdr.b_asize, hdr);

	(void) refcount_add_many(&dev->l2ad_alloc,
	    nhdr->b_l2hdr.b_asize, nhdr);

	buf_discard_identity(hdr);
	hdr->b_freeze_cksum = NULL;
	kmem_cache_free(old, hdr);

	return (nhdr);
}


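/*
 * Illustrative sketch (not part of the original source): arc_hdr_realloc()
 * is how a header migrates between the two kmem caches.  For example,
 * when an L2ARC-only header is about to be cached in memory again, a
 * caller holding the hash lock swaps it for a full header roughly like
 * this; the reverse direction drops the L1 portion again.
 */
#if 0
static arc_buf_hdr_t *
example_promote_l2only(arc_buf_hdr_t *hdr)
{
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	ASSERT(!HDR_HAS_L1HDR(hdr));

	/* reallocate from the compact cache into the full cache */
	return (arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache));
}
#endif
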
#define ARC_MINTIME (hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
}

static int
arc_cksum_equal(arc_buf_t *buf)
{
	zio_cksum_t zc;
	int equal;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);

	return (equal);
}

static void
arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
	    KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	arc_buf_watch(buf);
}

#ifndef _KERNEL
void
arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
{
	panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
}
#endif

/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
		    PROT_READ | PROT_WRITE));
	}
#endif
}

/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch)
		ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
#endif
}

static arc_buf_contents_t
arc_buf_type(arc_buf_hdr_t *hdr)
{
	if (HDR_ISTYPE_METADATA(hdr)) {
		return (ARC_BUFC_METADATA);
	} else {
		return (ARC_BUFC_DATA);
	}
}

static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)
{
	switch (type) {
	case ARC_BUFC_DATA:
		/* metadata field is 0 if buffer contains normal data */
		return (0);
	case ARC_BUFC_METADATA:
		return (ARC_FLAG_BUFC_METADATA);
	default:
		break;
	}
	panic("undefined ARC buffer type!");
	return ((uint32_t)-1);
}

void
arc_buf_thaw(arc_buf_t *buf)
{
	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
			panic("modifying non-anon buffer!");
		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
			panic("modifying buffer while i/o in progress!");
		arc_cksum_verify(buf);
	}

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}

	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);

	arc_buf_unwatch(buf);
}

void
arc_buf_freeze(arc_buf_t *buf)
{
	kmutex_t *hash_lock;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
	arc_cksum_compute(buf, B_FALSE);
	mutex_exit(hash_lock);
}

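/*
 * Illustrative sketch (not part of the original source): with
 * ZFS_DEBUG_MODIFY set in zfs_flags, a buffer is checksummed when it is
 * frozen and re-verified later, so any modification of a frozen buffer
 * panics.  The intended life cycle for an anonymous (in-flight write)
 * buffer, in hypothetical form:
 */
#if 0
static void
example_freeze_thaw(arc_buf_t *buf)
{
	arc_buf_freeze(buf);	/* records the fletcher-2 checksum */
	/* ... the buffer must not be modified while frozen ... */
	arc_buf_thaw(buf);	/* discards the checksum before a rewrite */
	/* ... modify buf->b_data here ... */
	arc_buf_freeze(buf);	/* re-freeze with the new contents */
}
#endif
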
static void
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
	arc_state_t *state;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(MUTEX_HELD(hash_lock));

	state = hdr->b_l1hdr.b_state;

	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
	    (state != arc_anon)) {
		/* We don't use the L2-only state list. */
		if (state != arc_l2c_only) {
			arc_buf_contents_t type = arc_buf_type(hdr);
			uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
			multilist_t *list = &state->arcs_list[type];
			uint64_t *size = &state->arcs_lsize[type];

			multilist_remove(list, hdr);

			if (GHOST_STATE(state)) {
				ASSERT0(hdr->b_l1hdr.b_datacnt);
				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
				delta = hdr->b_size;
			}
			ASSERT(delta > 0);
			ASSERT3U(*size, >=, delta);
			atomic_add_64(size, -delta);
		}
		/* remove the prefetch flag if we get a reference */
		hdr->b_flags &= ~ARC_FLAG_PREFETCH;
	}
}

static int
remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
	int cnt;
	arc_state_t *state = hdr->b_l1hdr.b_state;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(state));

	/*
	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
	 * check to prevent usage of the arc_l2c_only list.
	 */
	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
	    (state != arc_anon)) {
		arc_buf_contents_t type = arc_buf_type(hdr);
		multilist_t *list = &state->arcs_list[type];
		uint64_t *size = &state->arcs_lsize[type];

		multilist_insert(list, hdr);

		ASSERT(hdr->b_l1hdr.b_datacnt > 0);
		atomic_add_64(size, hdr->b_size *
		    hdr->b_l1hdr.b_datacnt);
	}
	return (cnt);
}

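/*
 * Illustrative sketch (not part of the original source): references are
 * tagged, and the first add_reference() pulls the header off its state's
 * evictable multilist while the last remove_reference() puts it back.
 * Hypothetical pairing, with the hash lock held and an arbitrary tag:
 */
#if 0
static void
example_hold_release(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	add_reference(hdr, hash_lock, tag);	/* header now unevictable */
	/* ... safely use the buffer's data ... */
	(void) remove_reference(hdr, hash_lock, tag); /* may become evictable */
}
#endif
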
/*
 * Returns detailed information about a specific arc buffer. When the
 * state_index argument is set the function will calculate the arc header
 * list position for its arc state. Since this requires a linear traversal
 * callers are strongly encouraged not to do this. However, it can be helpful
 * for targeted analysis so the functionality is provided.
 */
void
arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
{
	arc_buf_hdr_t *hdr = ab->b_hdr;
	l1arc_buf_hdr_t *l1hdr = NULL;
	l2arc_buf_hdr_t *l2hdr = NULL;
	arc_state_t *state = NULL;

	if (HDR_HAS_L1HDR(hdr)) {
		l1hdr = &hdr->b_l1hdr;
		state = l1hdr->b_state;
	}
	if (HDR_HAS_L2HDR(hdr))
		l2hdr = &hdr->b_l2hdr;

	memset(abi, 0, sizeof (arc_buf_info_t));
	abi->abi_flags = hdr->b_flags;

	if (l1hdr) {
		abi->abi_datacnt = l1hdr->b_datacnt;
		abi->abi_access = l1hdr->b_arc_access;
		abi->abi_mru_hits = l1hdr->b_mru_hits;
		abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
		abi->abi_mfu_hits = l1hdr->b_mfu_hits;
		abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
		abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
	}

	if (l2hdr) {
		abi->abi_l2arc_dattr = l2hdr->b_daddr;
		abi->abi_l2arc_asize = l2hdr->b_asize;
		abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr);
		abi->abi_l2arc_hits = l2hdr->b_hits;
	}

	abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
	abi->abi_state_contents = arc_buf_type(hdr);
	abi->abi_size = hdr->b_size;
}

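/*
 * Illustrative sketch (not part of the original source): a caller that
 * only wants the cheap fields passes a state_index of 0, which (per the
 * comment above) avoids the discouraged linear list-position calculation.
 * Hypothetical usage:
 */
#if 0
static uint64_t
example_buf_holds(arc_buf_t *ab)
{
	arc_buf_info_t abi;

	arc_buf_info(ab, &abi, 0);
	return (abi.abi_holds);
}
#endif
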
34dc7c2f 1469/*
ca0bf58d 1470 * Move the supplied buffer to the indicated state. The hash lock
34dc7c2f
BB
1471 * for the buffer must be held by the caller.
1472 */
1473static void
2a432414
GW
1474arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1475 kmutex_t *hash_lock)
34dc7c2f 1476{
b9541d6b
CW
1477 arc_state_t *old_state;
1478 int64_t refcnt;
1479 uint32_t datacnt;
34dc7c2f 1480 uint64_t from_delta, to_delta;
b9541d6b
CW
1481 arc_buf_contents_t buftype = arc_buf_type(hdr);
1482
1483 /*
1484 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1485 * in arc_read() when bringing a buffer out of the L2ARC. However, the
1486 * L1 hdr doesn't always exist when we change state to arc_anon before
1487 * destroying a header, in which case reallocating to add the L1 hdr is
1488 * pointless.
1489 */
1490 if (HDR_HAS_L1HDR(hdr)) {
1491 old_state = hdr->b_l1hdr.b_state;
1492 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1493 datacnt = hdr->b_l1hdr.b_datacnt;
1494 } else {
1495 old_state = arc_l2c_only;
1496 refcnt = 0;
1497 datacnt = 0;
1498 }
34dc7c2f
BB
1499
1500 ASSERT(MUTEX_HELD(hash_lock));
e8b96c60 1501 ASSERT3P(new_state, !=, old_state);
b9541d6b
CW
1502 ASSERT(refcnt == 0 || datacnt > 0);
1503 ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1504 ASSERT(old_state != arc_anon || datacnt <= 1);
34dc7c2f 1505
b9541d6b 1506 from_delta = to_delta = datacnt * hdr->b_size;
34dc7c2f
BB
1507
1508 /*
1509 * If this buffer is evictable, transfer it from the
1510 * old state list to the new state list.
1511 */
1512 if (refcnt == 0) {
b9541d6b 1513 if (old_state != arc_anon && old_state != arc_l2c_only) {
b9541d6b 1514 uint64_t *size = &old_state->arcs_lsize[buftype];
34dc7c2f 1515
b9541d6b 1516 ASSERT(HDR_HAS_L1HDR(hdr));
ca0bf58d 1517 multilist_remove(&old_state->arcs_list[buftype], hdr);
34dc7c2f
BB
1518
1519 /*
1520 * If prefetching out of the ghost cache,
428870ff 1521 * we will have a non-zero datacnt.
34dc7c2f 1522 */
b9541d6b 1523 if (GHOST_STATE(old_state) && datacnt == 0) {
34dc7c2f 1524 /* ghost elements have a ghost size */
b9541d6b 1525 ASSERT(hdr->b_l1hdr.b_buf == NULL);
2a432414 1526 from_delta = hdr->b_size;
34dc7c2f
BB
1527 }
1528 ASSERT3U(*size, >=, from_delta);
1529 atomic_add_64(size, -from_delta);
34dc7c2f 1530 }
b9541d6b 1531 if (new_state != arc_anon && new_state != arc_l2c_only) {
b9541d6b 1532 uint64_t *size = &new_state->arcs_lsize[buftype];
34dc7c2f 1533
b9541d6b
CW
1534 /*
1535 * An L1 header always exists here, since if we're
1536 * moving to some L1-cached state (i.e. not l2c_only or
1537 * anonymous), we realloc the header to add an L1hdr
1538 * beforehand.
1539 */
1540 ASSERT(HDR_HAS_L1HDR(hdr));
ca0bf58d 1541 multilist_insert(&new_state->arcs_list[buftype], hdr);
34dc7c2f
BB
1542
1543 /* ghost elements have a ghost size */
1544 if (GHOST_STATE(new_state)) {
b9541d6b
CW
1545 ASSERT0(datacnt);
1546 ASSERT(hdr->b_l1hdr.b_buf == NULL);
2a432414 1547 to_delta = hdr->b_size;
34dc7c2f
BB
1548 }
1549 atomic_add_64(size, to_delta);
34dc7c2f
BB
1550 }
1551 }
1552
2a432414
GW
1553 ASSERT(!BUF_EMPTY(hdr));
1554 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1555 buf_hash_remove(hdr);
34dc7c2f 1556
b9541d6b 1557 /* adjust state sizes (ignore arc_l2c_only) */
36da08ef
PS
1558
1559 if (to_delta && new_state != arc_l2c_only) {
1560 ASSERT(HDR_HAS_L1HDR(hdr));
1561 if (GHOST_STATE(new_state)) {
1562 ASSERT0(datacnt);
1563
1564 /*
 1565 * When moving a header to a ghost state, we first
1566 * remove all arc buffers. Thus, we'll have a
1567 * datacnt of zero, and no arc buffer to use for
1568 * the reference. As a result, we use the arc
1569 * header pointer for the reference.
1570 */
1571 (void) refcount_add_many(&new_state->arcs_size,
1572 hdr->b_size, hdr);
1573 } else {
1574 arc_buf_t *buf;
1575 ASSERT3U(datacnt, !=, 0);
1576
1577 /*
1578 * Each individual buffer holds a unique reference,
1579 * thus we must remove each of these references one
1580 * at a time.
1581 */
1582 for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
1583 buf = buf->b_next) {
1584 (void) refcount_add_many(&new_state->arcs_size,
1585 hdr->b_size, buf);
1586 }
1587 }
1588 }
1589
b9541d6b 1590 if (from_delta && old_state != arc_l2c_only) {
36da08ef
PS
1591 ASSERT(HDR_HAS_L1HDR(hdr));
1592 if (GHOST_STATE(old_state)) {
1593 /*
1594 * When moving a header off of a ghost state,
1595 * there's the possibility for datacnt to be
1596 * non-zero. This is because we first add the
1597 * arc buffer to the header prior to changing
1598 * the header's state. Since we used the header
1599 * for the reference when putting the header on
1600 * the ghost state, we must balance that and use
 1601 * the header when removing it from the ghost state
 1602 * (even though datacnt is non-zero).
1603 */
1604
1605 IMPLY(datacnt == 0, new_state == arc_anon ||
1606 new_state == arc_l2c_only);
1607
1608 (void) refcount_remove_many(&old_state->arcs_size,
1609 hdr->b_size, hdr);
1610 } else {
1611 arc_buf_t *buf;
1612 ASSERT3U(datacnt, !=, 0);
1613
1614 /*
1615 * Each individual buffer holds a unique reference,
1616 * thus we must remove each of these references one
1617 * at a time.
1618 */
1619 for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
1620 buf = buf->b_next) {
1621 (void) refcount_remove_many(
1622 &old_state->arcs_size, hdr->b_size, buf);
1623 }
1624 }
34dc7c2f 1625 }
36da08ef 1626
b9541d6b
CW
1627 if (HDR_HAS_L1HDR(hdr))
1628 hdr->b_l1hdr.b_state = new_state;
34dc7c2f 1629
b9541d6b
CW
1630 /*
1631 * L2 headers should never be on the L2 state list since they don't
1632 * have L1 headers allocated.
1633 */
ca0bf58d
PS
1634 ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1635 multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
34dc7c2f
BB
1636}
1637
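/*
 * Editor's sketch (not part of arc.c): the arcs_size accounting rule used in
 * arc_change_state() above, restated as a hypothetical invariant check. While
 * a header is on a non-ghost state, each of its arc_buf_t's holds its own
 * hdr->b_size reference against arcs_size; on a ghost state there are no
 * buffers, so the header pointer is used as the single reference holder.
 * The helper below is illustrative only.
 */
#if 0	/* illustrative only */
static void
arc_state_size_verify(arc_state_t *state, arc_buf_hdr_t *hdr)
{
	uint32_t holders = GHOST_STATE(state) ? 1 : hdr->b_l1hdr.b_datacnt;

	/* the state charge for this header is holders * b_size */
	ASSERT3U(refcount_count(&state->arcs_size), >=,
	    (uint64_t)holders * hdr->b_size);
}
#endif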
1638void
d164b209 1639arc_space_consume(uint64_t space, arc_space_type_t type)
34dc7c2f 1640{
d164b209
BB
1641 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1642
1643 switch (type) {
e75c13c3
BB
1644 default:
1645 break;
d164b209
BB
1646 case ARC_SPACE_DATA:
1647 ARCSTAT_INCR(arcstat_data_size, space);
1648 break;
cc7f677c 1649 case ARC_SPACE_META:
500445c0 1650 ARCSTAT_INCR(arcstat_metadata_size, space);
cc7f677c 1651 break;
d164b209
BB
1652 case ARC_SPACE_OTHER:
1653 ARCSTAT_INCR(arcstat_other_size, space);
1654 break;
1655 case ARC_SPACE_HDRS:
1656 ARCSTAT_INCR(arcstat_hdr_size, space);
1657 break;
1658 case ARC_SPACE_L2HDRS:
1659 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1660 break;
1661 }
1662
500445c0 1663 if (type != ARC_SPACE_DATA)
cc7f677c
PS
1664 ARCSTAT_INCR(arcstat_meta_used, space);
1665
34dc7c2f
BB
1666 atomic_add_64(&arc_size, space);
1667}
1668
1669void
d164b209 1670arc_space_return(uint64_t space, arc_space_type_t type)
34dc7c2f 1671{
d164b209
BB
1672 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1673
1674 switch (type) {
e75c13c3
BB
1675 default:
1676 break;
d164b209
BB
1677 case ARC_SPACE_DATA:
1678 ARCSTAT_INCR(arcstat_data_size, -space);
1679 break;
cc7f677c 1680 case ARC_SPACE_META:
500445c0 1681 ARCSTAT_INCR(arcstat_metadata_size, -space);
cc7f677c 1682 break;
d164b209
BB
1683 case ARC_SPACE_OTHER:
1684 ARCSTAT_INCR(arcstat_other_size, -space);
1685 break;
1686 case ARC_SPACE_HDRS:
1687 ARCSTAT_INCR(arcstat_hdr_size, -space);
1688 break;
1689 case ARC_SPACE_L2HDRS:
1690 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1691 break;
1692 }
1693
cc7f677c
PS
1694 if (type != ARC_SPACE_DATA) {
1695 ASSERT(arc_meta_used >= space);
500445c0
PS
1696 if (arc_meta_max < arc_meta_used)
1697 arc_meta_max = arc_meta_used;
cc7f677c
PS
1698 ARCSTAT_INCR(arcstat_meta_used, -space);
1699 }
1700
34dc7c2f
BB
1701 ASSERT(arc_size >= space);
1702 atomic_add_64(&arc_size, -space);
1703}
1704
34dc7c2f 1705arc_buf_t *
5f6d0b6f 1706arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
34dc7c2f
BB
1707{
1708 arc_buf_hdr_t *hdr;
1709 arc_buf_t *buf;
1710
f1512ee6 1711 VERIFY3U(size, <=, spa_maxblocksize(spa));
b9541d6b 1712 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
34dc7c2f 1713 ASSERT(BUF_EMPTY(hdr));
b9541d6b 1714 ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
34dc7c2f 1715 hdr->b_size = size;
3541dc6d 1716 hdr->b_spa = spa_load_guid(spa);
b9541d6b
CW
1717 hdr->b_l1hdr.b_mru_hits = 0;
1718 hdr->b_l1hdr.b_mru_ghost_hits = 0;
1719 hdr->b_l1hdr.b_mfu_hits = 0;
1720 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
1721 hdr->b_l1hdr.b_l2_hits = 0;
1722
34dc7c2f
BB
1723 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1724 buf->b_hdr = hdr;
1725 buf->b_data = NULL;
1726 buf->b_efunc = NULL;
1727 buf->b_private = NULL;
1728 buf->b_next = NULL;
b9541d6b
CW
1729
1730 hdr->b_flags = arc_bufc_to_flags(type);
1731 hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1732
1733 hdr->b_l1hdr.b_buf = buf;
1734 hdr->b_l1hdr.b_state = arc_anon;
1735 hdr->b_l1hdr.b_arc_access = 0;
1736 hdr->b_l1hdr.b_datacnt = 1;
ca0bf58d 1737 hdr->b_l1hdr.b_tmp_cdata = NULL;
b9541d6b 1738
34dc7c2f 1739 arc_get_data_buf(buf);
b9541d6b
CW
1740
1741 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1742 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
34dc7c2f
BB
1743
1744 return (buf);
1745}
1746
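/*
 * Editor's sketch (not part of arc.c): the basic allocate/free pairing for an
 * anonymous buffer created by arc_buf_alloc() above. The tag identifies the
 * reference holder; the helper name and use of SPA_MINBLOCKSIZE here are
 * illustrative assumptions only.
 */
#if 0	/* illustrative only */
static void
example_anon_buf(spa_t *spa)
{
	void *tag = FTAG;
	arc_buf_t *buf = arc_buf_alloc(spa, SPA_MINBLOCKSIZE, tag,
	    ARC_BUFC_DATA);

	/* ... fill buf->b_data with up to SPA_MINBLOCKSIZE bytes ... */

	arc_buf_free(buf, tag);
}
#endif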
9babb374
BB
1747static char *arc_onloan_tag = "onloan";
1748
1749/*
1750 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1751 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1752 * buffers must be returned to the arc before they can be used by the DMU or
1753 * freed.
1754 */
1755arc_buf_t *
5f6d0b6f 1756arc_loan_buf(spa_t *spa, uint64_t size)
9babb374
BB
1757{
1758 arc_buf_t *buf;
1759
1760 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1761
1762 atomic_add_64(&arc_loaned_bytes, size);
1763 return (buf);
1764}
1765
1766/*
1767 * Return a loaned arc buffer to the arc.
1768 */
1769void
1770arc_return_buf(arc_buf_t *buf, void *tag)
1771{
1772 arc_buf_hdr_t *hdr = buf->b_hdr;
1773
9babb374 1774 ASSERT(buf->b_data != NULL);
b9541d6b
CW
1775 ASSERT(HDR_HAS_L1HDR(hdr));
1776 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1777 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
9babb374
BB
1778
1779 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1780}
1781
428870ff
BB
1782/* Detach an arc_buf from a dbuf (tag) */
1783void
1784arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1785{
b9541d6b 1786 arc_buf_hdr_t *hdr = buf->b_hdr;
428870ff
BB
1787
1788 ASSERT(buf->b_data != NULL);
b9541d6b
CW
1789 ASSERT(HDR_HAS_L1HDR(hdr));
1790 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1791 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
428870ff
BB
1792 buf->b_efunc = NULL;
1793 buf->b_private = NULL;
1794
1795 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1796}
1797
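/*
 * Editor's sketch (not part of arc.c): the intended loan/return flow for the
 * three functions above. The DMU is the usual consumer; the helper name and
 * control flow below are illustrative assumptions only.
 */
#if 0	/* illustrative only */
static void
example_loan_flow(spa_t *spa, void *db_tag)
{
	/* borrow an anonymous buffer; it is held by arc_onloan_tag */
	arc_buf_t *buf = arc_loan_buf(spa, SPA_MINBLOCKSIZE);

	/* ... fill buf->b_data ... */

	/* hand the buffer to its consumer: the hold moves to db_tag */
	arc_return_buf(buf, db_tag);

	/* later, the consumer can detach it and put it back on loan */
	arc_loan_inuse_buf(buf, db_tag);
}
#endif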
34dc7c2f
BB
1798static arc_buf_t *
1799arc_buf_clone(arc_buf_t *from)
1800{
1801 arc_buf_t *buf;
1802 arc_buf_hdr_t *hdr = from->b_hdr;
1803 uint64_t size = hdr->b_size;
1804
b9541d6b
CW
1805 ASSERT(HDR_HAS_L1HDR(hdr));
1806 ASSERT(hdr->b_l1hdr.b_state != arc_anon);
428870ff 1807
34dc7c2f
BB
1808 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1809 buf->b_hdr = hdr;
1810 buf->b_data = NULL;
1811 buf->b_efunc = NULL;
1812 buf->b_private = NULL;
b9541d6b
CW
1813 buf->b_next = hdr->b_l1hdr.b_buf;
1814 hdr->b_l1hdr.b_buf = buf;
34dc7c2f
BB
1815 arc_get_data_buf(buf);
1816 bcopy(from->b_data, buf->b_data, size);
1eb5bfa3
GW
1817
1818 /*
1819 * This buffer already exists in the arc so create a duplicate
1820 * copy for the caller. If the buffer is associated with user data
1821 * then track the size and number of duplicates. These stats will be
1822 * updated as duplicate buffers are created and destroyed.
1823 */
b9541d6b 1824 if (HDR_ISTYPE_DATA(hdr)) {
1eb5bfa3
GW
1825 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1826 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1827 }
b9541d6b 1828 hdr->b_l1hdr.b_datacnt += 1;
34dc7c2f
BB
1829 return (buf);
1830}
1831
1832void
1833arc_buf_add_ref(arc_buf_t *buf, void* tag)
1834{
1835 arc_buf_hdr_t *hdr;
1836 kmutex_t *hash_lock;
1837
1838 /*
b128c09f
BB
1839 * Check to see if this buffer is evicted. Callers
1840 * must verify b_data != NULL to know if the add_ref
1841 * was successful.
34dc7c2f 1842 */
428870ff 1843 mutex_enter(&buf->b_evict_lock);
b128c09f 1844 if (buf->b_data == NULL) {
428870ff 1845 mutex_exit(&buf->b_evict_lock);
34dc7c2f
BB
1846 return;
1847 }
428870ff 1848 hash_lock = HDR_LOCK(buf->b_hdr);
34dc7c2f 1849 mutex_enter(hash_lock);
428870ff 1850 hdr = buf->b_hdr;
b9541d6b 1851 ASSERT(HDR_HAS_L1HDR(hdr));
428870ff
BB
1852 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1853 mutex_exit(&buf->b_evict_lock);
34dc7c2f 1854
b9541d6b
CW
1855 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
1856 hdr->b_l1hdr.b_state == arc_mfu);
1857
34dc7c2f 1858 add_reference(hdr, hash_lock, tag);
d164b209 1859 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
34dc7c2f
BB
1860 arc_access(hdr, hash_lock);
1861 mutex_exit(hash_lock);
1862 ARCSTAT_BUMP(arcstat_hits);
b9541d6b
CW
1863 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
1864 demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
34dc7c2f
BB
1865 data, metadata, hits);
1866}
1867
ca0bf58d
PS
1868static void
1869arc_buf_free_on_write(void *data, size_t size,
1870 void (*free_func)(void *, size_t))
1871{
1872 l2arc_data_free_t *df;
1873
1874 df = kmem_alloc(sizeof (*df), KM_SLEEP);
1875 df->l2df_data = data;
1876 df->l2df_size = size;
1877 df->l2df_func = free_func;
1878 mutex_enter(&l2arc_free_on_write_mtx);
1879 list_insert_head(l2arc_free_on_write, df);
1880 mutex_exit(&l2arc_free_on_write_mtx);
1881}
1882
34dc7c2f
BB
1883/*
 1884 * Free the arc data buffer. If an l2arc write is in progress,
1885 * the buffer is placed on l2arc_free_on_write to be freed later.
1886 */
1887static void
498877ba 1888arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
34dc7c2f 1889{
498877ba
MA
1890 arc_buf_hdr_t *hdr = buf->b_hdr;
1891
34dc7c2f 1892 if (HDR_L2_WRITING(hdr)) {
ca0bf58d 1893 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
34dc7c2f
BB
1894 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1895 } else {
498877ba 1896 free_func(buf->b_data, hdr->b_size);
34dc7c2f
BB
1897 }
1898}
1899
ca0bf58d
PS
1900static void
1901arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1902{
1903 ASSERT(HDR_HAS_L2HDR(hdr));
1904 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
1905
1906 /*
1907 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
1908 * that doesn't exist, the header is in the arc_l2c_only state,
1909 * and there isn't anything to free (it's already been freed).
1910 */
1911 if (!HDR_HAS_L1HDR(hdr))
1912 return;
1913
1914 /*
1915 * The header isn't being written to the l2arc device, thus it
1916 * shouldn't have a b_tmp_cdata to free.
1917 */
1918 if (!HDR_L2_WRITING(hdr)) {
1919 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1920 return;
1921 }
1922
1923 /*
1924 * The header does not have compression enabled. This can be due
1925 * to the buffer not being compressible, or because we're
1926 * freeing the buffer before the second phase of
 1927 * l2arc_write_buffers() has started (which does the compression
1928 * step). In either case, b_tmp_cdata does not point to a
1929 * separately compressed buffer, so there's nothing to free (it
1930 * points to the same buffer as the arc_buf_t's b_data field).
1931 */
1932 if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
1933 hdr->b_l1hdr.b_tmp_cdata = NULL;
1934 return;
1935 }
1936
1937 /*
 1938 * There's nothing to free since the buffer was all zeros and
 1939 * compressed to a zero-length buffer.
1940 */
1941 if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
1942 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1943 return;
1944 }
1945
1946 ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
1947
1948 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
1949 hdr->b_size, zio_data_buf_free);
1950
1951 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1952 hdr->b_l1hdr.b_tmp_cdata = NULL;
1953}
1954
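/*
 * Editor's note (illustrative summary, not upstream text): the cases handled
 * by arc_buf_l2_cdata_free() above reduce to the following; only the final
 * case frees a separately allocated compressed copy.
 *
 *   no L1 header                   -> nothing to free (arc_l2c_only)
 *   !HDR_L2_WRITING                -> b_tmp_cdata must already be NULL
 *   compress == ZIO_COMPRESS_OFF   -> b_tmp_cdata aliases b_data; just clear it
 *   compress == ZIO_COMPRESS_EMPTY -> all-zero buffer; nothing was allocated
 *   otherwise                      -> free b_tmp_cdata on-write and clear it
 */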
bd089c54
MA
1955/*
 1956 * Free up buf->b_data and, if 'remove' is set, pull the
 1957 * arc_buf_t off of the arc_buf_hdr_t's list and free it.
1958 */
34dc7c2f 1959static void
ca0bf58d 1960arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
34dc7c2f
BB
1961{
1962 arc_buf_t **bufp;
1963
1964 /* free up data associated with the buf */
b9541d6b
CW
1965 if (buf->b_data != NULL) {
1966 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
34dc7c2f 1967 uint64_t size = buf->b_hdr->b_size;
b9541d6b 1968 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
34dc7c2f
BB
1969
1970 arc_cksum_verify(buf);
498877ba 1971 arc_buf_unwatch(buf);
428870ff 1972
ca0bf58d
PS
1973 if (type == ARC_BUFC_METADATA) {
1974 arc_buf_data_free(buf, zio_buf_free);
1975 arc_space_return(size, ARC_SPACE_META);
1976 } else {
1977 ASSERT(type == ARC_BUFC_DATA);
1978 arc_buf_data_free(buf, zio_data_buf_free);
1979 arc_space_return(size, ARC_SPACE_DATA);
34dc7c2f 1980 }
ca0bf58d
PS
1981
1982 /* protected by hash lock, if in the hash table */
1983 if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
34dc7c2f
BB
1984 uint64_t *cnt = &state->arcs_lsize[type];
1985
b9541d6b
CW
1986 ASSERT(refcount_is_zero(
1987 &buf->b_hdr->b_l1hdr.b_refcnt));
1988 ASSERT(state != arc_anon && state != arc_l2c_only);
34dc7c2f
BB
1989
1990 ASSERT3U(*cnt, >=, size);
1991 atomic_add_64(cnt, -size);
1992 }
36da08ef
PS
1993
1994 (void) refcount_remove_many(&state->arcs_size, size, buf);
34dc7c2f 1995 buf->b_data = NULL;
1eb5bfa3
GW
1996
1997 /*
1998 * If we're destroying a duplicate buffer make sure
1999 * that the appropriate statistics are updated.
2000 */
b9541d6b
CW
2001 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2002 HDR_ISTYPE_DATA(buf->b_hdr)) {
1eb5bfa3
GW
2003 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2004 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2005 }
b9541d6b
CW
2006 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2007 buf->b_hdr->b_l1hdr.b_datacnt -= 1;
34dc7c2f
BB
2008 }
2009
2010 /* only remove the buf if requested */
bd089c54 2011 if (!remove)
34dc7c2f
BB
2012 return;
2013
2014 /* remove the buf from the hdr list */
b9541d6b
CW
2015 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2016 bufp = &(*bufp)->b_next)
34dc7c2f
BB
2017 continue;
2018 *bufp = buf->b_next;
428870ff 2019 buf->b_next = NULL;
34dc7c2f
BB
2020
2021 ASSERT(buf->b_efunc == NULL);
2022
2023 /* clean up the buf */
2024 buf->b_hdr = NULL;
2025 kmem_cache_free(buf_cache, buf);
2026}
2027
d962d5da
PS
2028static void
2029arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2030{
2031 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2032 l2arc_dev_t *dev = l2hdr->b_dev;
2033
2034 ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2035 ASSERT(HDR_HAS_L2HDR(hdr));
2036
2037 list_remove(&dev->l2ad_buflist, hdr);
2038
2039 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
2040
2041 /*
2042 * We don't want to leak the b_tmp_cdata buffer that was
2043 * allocated in l2arc_write_buffers()
2044 */
2045 arc_buf_l2_cdata_free(hdr);
2046
2047 /*
2048 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2049 * this header is being processed by l2arc_write_buffers() (i.e.
2050 * it's in the first stage of l2arc_write_buffers()).
2051 * Re-affirming that truth here, just to serve as a reminder. If
2052 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2053 * may not have its HDR_L2_WRITING flag set. (the write may have
2054 * completed, in which case HDR_L2_WRITING will be false and the
2055 * b_daddr field will point to the address of the buffer on disk).
2056 */
2057 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2058
2059 /*
2060 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2061 * l2arc_write_buffers(). Since we've just removed this header
2062 * from the l2arc buffer list, this header will never reach the
2063 * second stage of l2arc_write_buffers(), which increments the
2064 * accounting stats for this header. Thus, we must be careful
2065 * not to decrement them for this header either.
2066 */
2067 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2068 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2069 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2070
2071 vdev_space_update(dev->l2ad_vdev,
2072 -l2hdr->b_asize, 0, 0);
2073
2074 (void) refcount_remove_many(&dev->l2ad_alloc,
2075 l2hdr->b_asize, hdr);
2076 }
2077
2078 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2079}
2080
34dc7c2f
BB
2081static void
2082arc_hdr_destroy(arc_buf_hdr_t *hdr)
2083{
b9541d6b
CW
2084 if (HDR_HAS_L1HDR(hdr)) {
2085 ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2086 hdr->b_l1hdr.b_datacnt > 0);
2087 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2088 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2089 }
34dc7c2f 2090 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
b9541d6b
CW
2091 ASSERT(!HDR_IN_HASH_TABLE(hdr));
2092
2093 if (HDR_HAS_L2HDR(hdr)) {
d962d5da
PS
2094 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2095 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
428870ff 2096
d962d5da
PS
2097 if (!buflist_held)
2098 mutex_enter(&dev->l2ad_mtx);
b9541d6b 2099
ca0bf58d 2100 /*
d962d5da
PS
2101 * Even though we checked this conditional above, we
2102 * need to check this again now that we have the
2103 * l2ad_mtx. This is because we could be racing with
2104 * another thread calling l2arc_evict() which might have
2105 * destroyed this header's L2 portion as we were waiting
2106 * to acquire the l2ad_mtx. If that happens, we don't
2107 * want to re-destroy the header's L2 portion.
ca0bf58d 2108 */
d962d5da
PS
2109 if (HDR_HAS_L2HDR(hdr))
2110 arc_hdr_l2hdr_destroy(hdr);
428870ff
BB
2111
2112 if (!buflist_held)
d962d5da 2113 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
2114 }
2115
b9541d6b 2116 if (!BUF_EMPTY(hdr))
428870ff 2117 buf_discard_identity(hdr);
b9541d6b 2118
34dc7c2f
BB
2119 if (hdr->b_freeze_cksum != NULL) {
2120 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2121 hdr->b_freeze_cksum = NULL;
2122 }
2123
b9541d6b
CW
2124 if (HDR_HAS_L1HDR(hdr)) {
2125 while (hdr->b_l1hdr.b_buf) {
2126 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2127
2128 if (buf->b_efunc != NULL) {
ca0bf58d 2129 mutex_enter(&arc_user_evicts_lock);
b9541d6b
CW
2130 mutex_enter(&buf->b_evict_lock);
2131 ASSERT(buf->b_hdr != NULL);
ca0bf58d 2132 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
b9541d6b
CW
2133 hdr->b_l1hdr.b_buf = buf->b_next;
2134 buf->b_hdr = &arc_eviction_hdr;
2135 buf->b_next = arc_eviction_list;
2136 arc_eviction_list = buf;
2137 mutex_exit(&buf->b_evict_lock);
ca0bf58d
PS
2138 cv_signal(&arc_user_evicts_cv);
2139 mutex_exit(&arc_user_evicts_lock);
b9541d6b 2140 } else {
ca0bf58d 2141 arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
b9541d6b
CW
2142 }
2143 }
2144 }
2145
34dc7c2f 2146 ASSERT3P(hdr->b_hash_next, ==, NULL);
b9541d6b 2147 if (HDR_HAS_L1HDR(hdr)) {
ca0bf58d 2148 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
b9541d6b
CW
2149 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2150 kmem_cache_free(hdr_full_cache, hdr);
2151 } else {
2152 kmem_cache_free(hdr_l2only_cache, hdr);
2153 }
34dc7c2f
BB
2154}
2155
2156void
2157arc_buf_free(arc_buf_t *buf, void *tag)
2158{
2159 arc_buf_hdr_t *hdr = buf->b_hdr;
b9541d6b 2160 int hashed = hdr->b_l1hdr.b_state != arc_anon;
34dc7c2f
BB
2161
2162 ASSERT(buf->b_efunc == NULL);
2163 ASSERT(buf->b_data != NULL);
2164
2165 if (hashed) {
2166 kmutex_t *hash_lock = HDR_LOCK(hdr);
2167
2168 mutex_enter(hash_lock);
428870ff
BB
2169 hdr = buf->b_hdr;
2170 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2171
34dc7c2f 2172 (void) remove_reference(hdr, hash_lock, tag);
b9541d6b 2173 if (hdr->b_l1hdr.b_datacnt > 1) {
ca0bf58d 2174 arc_buf_destroy(buf, TRUE);
428870ff 2175 } else {
b9541d6b 2176 ASSERT(buf == hdr->b_l1hdr.b_buf);
428870ff 2177 ASSERT(buf->b_efunc == NULL);
2a432414 2178 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
428870ff 2179 }
34dc7c2f
BB
2180 mutex_exit(hash_lock);
2181 } else if (HDR_IO_IN_PROGRESS(hdr)) {
2182 int destroy_hdr;
2183 /*
2184 * We are in the middle of an async write. Don't destroy
2185 * this buffer unless the write completes before we finish
2186 * decrementing the reference count.
2187 */
ca0bf58d 2188 mutex_enter(&arc_user_evicts_lock);
34dc7c2f 2189 (void) remove_reference(hdr, NULL, tag);
b9541d6b 2190 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
34dc7c2f 2191 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
ca0bf58d 2192 mutex_exit(&arc_user_evicts_lock);
34dc7c2f
BB
2193 if (destroy_hdr)
2194 arc_hdr_destroy(hdr);
2195 } else {
428870ff 2196 if (remove_reference(hdr, NULL, tag) > 0)
ca0bf58d 2197 arc_buf_destroy(buf, TRUE);
428870ff 2198 else
34dc7c2f 2199 arc_hdr_destroy(hdr);
34dc7c2f
BB
2200 }
2201}
2202
13fe0198 2203boolean_t
34dc7c2f
BB
2204arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2205{
2206 arc_buf_hdr_t *hdr = buf->b_hdr;
b4f7f105 2207 kmutex_t *hash_lock = NULL;
13fe0198 2208 boolean_t no_callback = (buf->b_efunc == NULL);
34dc7c2f 2209
b9541d6b
CW
2210 if (hdr->b_l1hdr.b_state == arc_anon) {
2211 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
34dc7c2f
BB
2212 arc_buf_free(buf, tag);
2213 return (no_callback);
2214 }
2215
b4f7f105 2216 hash_lock = HDR_LOCK(hdr);
34dc7c2f 2217 mutex_enter(hash_lock);
428870ff 2218 hdr = buf->b_hdr;
b9541d6b 2219 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
428870ff 2220 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
b9541d6b 2221 ASSERT(hdr->b_l1hdr.b_state != arc_anon);
34dc7c2f
BB
2222 ASSERT(buf->b_data != NULL);
2223
2224 (void) remove_reference(hdr, hash_lock, tag);
b9541d6b 2225 if (hdr->b_l1hdr.b_datacnt > 1) {
34dc7c2f 2226 if (no_callback)
ca0bf58d 2227 arc_buf_destroy(buf, TRUE);
34dc7c2f 2228 } else if (no_callback) {
b9541d6b 2229 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
428870ff 2230 ASSERT(buf->b_efunc == NULL);
2a432414 2231 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
34dc7c2f 2232 }
b9541d6b
CW
2233 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2234 refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
34dc7c2f
BB
2235 mutex_exit(hash_lock);
2236 return (no_callback);
2237}
2238
5f6d0b6f 2239uint64_t
34dc7c2f
BB
2240arc_buf_size(arc_buf_t *buf)
2241{
2242 return (buf->b_hdr->b_size);
2243}
2244
1eb5bfa3
GW
2245/*
2246 * Called from the DMU to determine if the current buffer should be
2247 * evicted. In order to ensure proper locking, the eviction must be initiated
2248 * from the DMU. Return true if the buffer is associated with user data and
2249 * duplicate buffers still exist.
2250 */
2251boolean_t
2252arc_buf_eviction_needed(arc_buf_t *buf)
2253{
2254 arc_buf_hdr_t *hdr;
2255 boolean_t evict_needed = B_FALSE;
2256
2257 if (zfs_disable_dup_eviction)
2258 return (B_FALSE);
2259
2260 mutex_enter(&buf->b_evict_lock);
2261 hdr = buf->b_hdr;
2262 if (hdr == NULL) {
2263 /*
2264 * We are in arc_do_user_evicts(); let that function
2265 * perform the eviction.
2266 */
2267 ASSERT(buf->b_data == NULL);
2268 mutex_exit(&buf->b_evict_lock);
2269 return (B_FALSE);
2270 } else if (buf->b_data == NULL) {
2271 /*
2272 * We have already been added to the arc eviction list;
2273 * recommend eviction.
2274 */
2275 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2276 mutex_exit(&buf->b_evict_lock);
2277 return (B_TRUE);
2278 }
2279
b9541d6b 2280 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
1eb5bfa3
GW
2281 evict_needed = B_TRUE;
2282
2283 mutex_exit(&buf->b_evict_lock);
2284 return (evict_needed);
2285}
2286
34dc7c2f 2287/*
ca0bf58d
PS
2288 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2289 * state of the header is dependent on its state prior to entering this
2290 * function. The following transitions are possible:
34dc7c2f 2291 *
ca0bf58d
PS
2292 * - arc_mru -> arc_mru_ghost
2293 * - arc_mfu -> arc_mfu_ghost
2294 * - arc_mru_ghost -> arc_l2c_only
2295 * - arc_mru_ghost -> deleted
2296 * - arc_mfu_ghost -> arc_l2c_only
2297 * - arc_mfu_ghost -> deleted
34dc7c2f 2298 */
ca0bf58d
PS
2299static int64_t
2300arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
34dc7c2f 2301{
ca0bf58d
PS
2302 arc_state_t *evicted_state, *state;
2303 int64_t bytes_evicted = 0;
34dc7c2f 2304
ca0bf58d
PS
2305 ASSERT(MUTEX_HELD(hash_lock));
2306 ASSERT(HDR_HAS_L1HDR(hdr));
e8b96c60 2307
ca0bf58d
PS
2308 state = hdr->b_l1hdr.b_state;
2309 if (GHOST_STATE(state)) {
2310 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2311 ASSERT(hdr->b_l1hdr.b_buf == NULL);
e8b96c60
MA
2312
2313 /*
ca0bf58d
PS
2314 * l2arc_write_buffers() relies on a header's L1 portion
2315 * (i.e. its b_tmp_cdata field) during its write phase.
2316 * Thus, we cannot push a header onto the arc_l2c_only
2317 * state (removing its L1 piece) until the header is
2318 * done being written to the l2arc.
e8b96c60 2319 */
ca0bf58d
PS
2320 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2321 ARCSTAT_BUMP(arcstat_evict_l2_skip);
2322 return (bytes_evicted);
e8b96c60
MA
2323 }
2324
ca0bf58d
PS
2325 ARCSTAT_BUMP(arcstat_deleted);
2326 bytes_evicted += hdr->b_size;
428870ff 2327
ca0bf58d 2328 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
428870ff 2329
ca0bf58d
PS
2330 if (HDR_HAS_L2HDR(hdr)) {
2331 /*
2332 * This buffer is cached on the 2nd Level ARC;
2333 * don't destroy the header.
2334 */
2335 arc_change_state(arc_l2c_only, hdr, hash_lock);
2336 /*
2337 * dropping from L1+L2 cached to L2-only,
2338 * realloc to remove the L1 header.
2339 */
2340 hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2341 hdr_l2only_cache);
34dc7c2f 2342 } else {
ca0bf58d
PS
2343 arc_change_state(arc_anon, hdr, hash_lock);
2344 arc_hdr_destroy(hdr);
34dc7c2f 2345 }
ca0bf58d 2346 return (bytes_evicted);
34dc7c2f
BB
2347 }
2348
ca0bf58d
PS
2349 ASSERT(state == arc_mru || state == arc_mfu);
2350 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
34dc7c2f 2351
ca0bf58d
PS
2352 /* prefetch buffers have a minimum lifespan */
2353 if (HDR_IO_IN_PROGRESS(hdr) ||
2354 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2355 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2356 arc_min_prefetch_lifespan)) {
2357 ARCSTAT_BUMP(arcstat_evict_skip);
2358 return (bytes_evicted);
da8ccd0e
PS
2359 }
2360
ca0bf58d
PS
2361 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2362 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2363 while (hdr->b_l1hdr.b_buf) {
2364 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2365 if (!mutex_tryenter(&buf->b_evict_lock)) {
2366 ARCSTAT_BUMP(arcstat_mutex_miss);
2367 break;
2368 }
2369 if (buf->b_data != NULL)
2370 bytes_evicted += hdr->b_size;
2371 if (buf->b_efunc != NULL) {
2372 mutex_enter(&arc_user_evicts_lock);
2373 arc_buf_destroy(buf, FALSE);
2374 hdr->b_l1hdr.b_buf = buf->b_next;
2375 buf->b_hdr = &arc_eviction_hdr;
2376 buf->b_next = arc_eviction_list;
2377 arc_eviction_list = buf;
2378 cv_signal(&arc_user_evicts_cv);
2379 mutex_exit(&arc_user_evicts_lock);
2380 mutex_exit(&buf->b_evict_lock);
2381 } else {
2382 mutex_exit(&buf->b_evict_lock);
2383 arc_buf_destroy(buf, TRUE);
2384 }
2385 }
34dc7c2f 2386
ca0bf58d
PS
2387 if (HDR_HAS_L2HDR(hdr)) {
2388 ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
2389 } else {
2390 if (l2arc_write_eligible(hdr->b_spa, hdr))
2391 ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
2392 else
2393 ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
2394 }
34dc7c2f 2395
ca0bf58d
PS
2396 if (hdr->b_l1hdr.b_datacnt == 0) {
2397 arc_change_state(evicted_state, hdr, hash_lock);
2398 ASSERT(HDR_IN_HASH_TABLE(hdr));
2399 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2400 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2401 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2402 }
34dc7c2f 2403
ca0bf58d 2404 return (bytes_evicted);
34dc7c2f
BB
2405}
2406
ca0bf58d
PS
2407static uint64_t
2408arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
2409 uint64_t spa, int64_t bytes)
34dc7c2f 2410{
ca0bf58d
PS
2411 multilist_sublist_t *mls;
2412 uint64_t bytes_evicted = 0;
2413 arc_buf_hdr_t *hdr;
34dc7c2f 2414 kmutex_t *hash_lock;
ca0bf58d 2415 int evict_count = 0;
34dc7c2f 2416
ca0bf58d
PS
2417 ASSERT3P(marker, !=, NULL);
2418 ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
2419
2420 mls = multilist_sublist_lock(ml, idx);
572e2857 2421
ca0bf58d
PS
2422 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
2423 hdr = multilist_sublist_prev(mls, marker)) {
2424 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
2425 (evict_count >= zfs_arc_evict_batch_limit))
2426 break;
2427
2428 /*
2429 * To keep our iteration location, move the marker
2430 * forward. Since we're not holding hdr's hash lock, we
2431 * must be very careful and not remove 'hdr' from the
2432 * sublist. Otherwise, other consumers might mistake the
2433 * 'hdr' as not being on a sublist when they call the
2434 * multilist_link_active() function (they all rely on
2435 * the hash lock protecting concurrent insertions and
2436 * removals). multilist_sublist_move_forward() was
2437 * specifically implemented to ensure this is the case
2438 * (only 'marker' will be removed and re-inserted).
2439 */
2440 multilist_sublist_move_forward(mls, marker);
2441
2442 /*
2443 * The only case where the b_spa field should ever be
 2444 * zero is for the marker headers inserted by
2445 * arc_evict_state(). It's possible for multiple threads
2446 * to be calling arc_evict_state() concurrently (e.g.
2447 * dsl_pool_close() and zio_inject_fault()), so we must
2448 * skip any markers we see from these other threads.
2449 */
2a432414 2450 if (hdr->b_spa == 0)
572e2857
BB
2451 continue;
2452
ca0bf58d
PS
2453 /* we're only interested in evicting buffers of a certain spa */
2454 if (spa != 0 && hdr->b_spa != spa) {
2455 ARCSTAT_BUMP(arcstat_evict_skip);
428870ff 2456 continue;
ca0bf58d
PS
2457 }
2458
2459 hash_lock = HDR_LOCK(hdr);
e8b96c60
MA
2460
2461 /*
ca0bf58d
PS
2462 * We aren't calling this function from any code path
2463 * that would already be holding a hash lock, so we're
2464 * asserting on this assumption to be defensive in case
2465 * this ever changes. Without this check, it would be
2466 * possible to incorrectly increment arcstat_mutex_miss
2467 * below (e.g. if the code changed such that we called
2468 * this function with a hash lock held).
e8b96c60 2469 */
ca0bf58d
PS
2470 ASSERT(!MUTEX_HELD(hash_lock));
2471
34dc7c2f 2472 if (mutex_tryenter(hash_lock)) {
ca0bf58d
PS
2473 uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
2474 mutex_exit(hash_lock);
34dc7c2f 2475
ca0bf58d 2476 bytes_evicted += evicted;
34dc7c2f 2477
572e2857 2478 /*
ca0bf58d
PS
2479 * If evicted is zero, arc_evict_hdr() must have
2480 * decided to skip this header, don't increment
2481 * evict_count in this case.
572e2857 2482 */
ca0bf58d
PS
2483 if (evicted != 0)
2484 evict_count++;
2485
2486 /*
2487 * If arc_size isn't overflowing, signal any
2488 * threads that might happen to be waiting.
2489 *
2490 * For each header evicted, we wake up a single
2491 * thread. If we used cv_broadcast, we could
2492 * wake up "too many" threads causing arc_size
2493 * to significantly overflow arc_c; since
2494 * arc_get_data_buf() doesn't check for overflow
2495 * when it's woken up (it doesn't because it's
2496 * possible for the ARC to be overflowing while
2497 * full of un-evictable buffers, and the
2498 * function should proceed in this case).
2499 *
2500 * If threads are left sleeping, due to not
2501 * using cv_broadcast, they will be woken up
2502 * just before arc_reclaim_thread() sleeps.
2503 */
2504 mutex_enter(&arc_reclaim_lock);
2505 if (!arc_is_overflowing())
2506 cv_signal(&arc_reclaim_waiters_cv);
2507 mutex_exit(&arc_reclaim_lock);
e8b96c60 2508 } else {
ca0bf58d 2509 ARCSTAT_BUMP(arcstat_mutex_miss);
e8b96c60 2510 }
34dc7c2f 2511 }
34dc7c2f 2512
ca0bf58d 2513 multilist_sublist_unlock(mls);
34dc7c2f 2514
ca0bf58d 2515 return (bytes_evicted);
34dc7c2f
BB
2516}
2517
ca0bf58d
PS
2518/*
2519 * Evict buffers from the given arc state, until we've removed the
2520 * specified number of bytes. Move the removed buffers to the
2521 * appropriate evict state.
2522 *
2523 * This function makes a "best effort". It skips over any buffers
 2524 * it can't get a hash_lock on, and so may not catch all candidates.
2525 * It may also return without evicting as much space as requested.
2526 *
2527 * If bytes is specified using the special value ARC_EVICT_ALL, this
2528 * will evict all available (i.e. unlocked and evictable) buffers from
2529 * the given arc state; which is used by arc_flush().
2530 */
2531static uint64_t
2532arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
2533 arc_buf_contents_t type)
34dc7c2f 2534{
ca0bf58d
PS
2535 uint64_t total_evicted = 0;
2536 multilist_t *ml = &state->arcs_list[type];
2537 int num_sublists;
2538 arc_buf_hdr_t **markers;
2539 int i;
2540
2541 ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
2542
2543 num_sublists = multilist_get_num_sublists(ml);
d164b209
BB
2544
2545 /*
ca0bf58d
PS
2546 * If we've tried to evict from each sublist, made some
2547 * progress, but still have not hit the target number of bytes
2548 * to evict, we want to keep trying. The markers allow us to
2549 * pick up where we left off for each individual sublist, rather
2550 * than starting from the tail each time.
d164b209 2551 */
ca0bf58d
PS
2552 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
2553 for (i = 0; i < num_sublists; i++) {
2554 multilist_sublist_t *mls;
34dc7c2f 2555
ca0bf58d
PS
2556 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
2557
2558 /*
2559 * A b_spa of 0 is used to indicate that this header is
2560 * a marker. This fact is used in arc_adjust_type() and
2561 * arc_evict_state_impl().
2562 */
2563 markers[i]->b_spa = 0;
34dc7c2f 2564
ca0bf58d
PS
2565 mls = multilist_sublist_lock(ml, i);
2566 multilist_sublist_insert_tail(mls, markers[i]);
2567 multilist_sublist_unlock(mls);
34dc7c2f
BB
2568 }
2569
d164b209 2570 /*
ca0bf58d
PS
2571 * While we haven't hit our target number of bytes to evict, or
2572 * we're evicting all available buffers.
d164b209 2573 */
ca0bf58d
PS
2574 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
2575 /*
2576 * Start eviction using a randomly selected sublist,
2577 * this is to try and evenly balance eviction across all
2578 * sublists. Always starting at the same sublist
2579 * (e.g. index 0) would cause evictions to favor certain
2580 * sublists over others.
2581 */
2582 int sublist_idx = multilist_get_random_index(ml);
2583 uint64_t scan_evicted = 0;
34dc7c2f 2584
ca0bf58d
PS
2585 for (i = 0; i < num_sublists; i++) {
2586 uint64_t bytes_remaining;
2587 uint64_t bytes_evicted;
d164b209 2588
ca0bf58d
PS
2589 if (bytes == ARC_EVICT_ALL)
2590 bytes_remaining = ARC_EVICT_ALL;
2591 else if (total_evicted < bytes)
2592 bytes_remaining = bytes - total_evicted;
2593 else
2594 break;
34dc7c2f 2595
ca0bf58d
PS
2596 bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
2597 markers[sublist_idx], spa, bytes_remaining);
2598
2599 scan_evicted += bytes_evicted;
2600 total_evicted += bytes_evicted;
2601
2602 /* we've reached the end, wrap to the beginning */
2603 if (++sublist_idx >= num_sublists)
2604 sublist_idx = 0;
2605 }
2606
2607 /*
2608 * If we didn't evict anything during this scan, we have
2609 * no reason to believe we'll evict more during another
2610 * scan, so break the loop.
2611 */
2612 if (scan_evicted == 0) {
2613 /* This isn't possible, let's make that obvious */
2614 ASSERT3S(bytes, !=, 0);
34dc7c2f 2615
ca0bf58d
PS
2616 /*
2617 * When bytes is ARC_EVICT_ALL, the only way to
2618 * break the loop is when scan_evicted is zero.
2619 * In that case, we actually have evicted enough,
2620 * so we don't want to increment the kstat.
2621 */
2622 if (bytes != ARC_EVICT_ALL) {
2623 ASSERT3S(total_evicted, <, bytes);
2624 ARCSTAT_BUMP(arcstat_evict_not_enough);
2625 }
d164b209 2626
ca0bf58d
PS
2627 break;
2628 }
d164b209 2629 }
34dc7c2f 2630
ca0bf58d
PS
2631 for (i = 0; i < num_sublists; i++) {
2632 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
2633 multilist_sublist_remove(mls, markers[i]);
2634 multilist_sublist_unlock(mls);
34dc7c2f 2635
ca0bf58d 2636 kmem_cache_free(hdr_full_cache, markers[i]);
34dc7c2f 2637 }
ca0bf58d
PS
2638 kmem_free(markers, sizeof (*markers) * num_sublists);
2639
2640 return (total_evicted);
2641}
2642
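/*
 * Editor's note (worked example, not upstream text): in arc_evict_state()
 * above, with num_sublists == 4 and a random starting index of 2, the inner
 * loop visits the sublists in the order 2, 3, 0, 1, stopping early once
 * total_evicted reaches the requested byte count. The per-sublist markers
 * mean a later pass resumes where each sublist's previous scan left off
 * rather than starting again from the tail.
 */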
2643/*
2644 * Flush all "evictable" data of the given type from the arc state
2645 * specified. This will not evict any "active" buffers (i.e. referenced).
2646 *
2647 * When 'retry' is set to FALSE, the function will make a single pass
2648 * over the state and evict any buffers that it can. Since it doesn't
2649 * continually retry the eviction, it might end up leaving some buffers
2650 * in the ARC due to lock misses.
2651 *
2652 * When 'retry' is set to TRUE, the function will continually retry the
2653 * eviction until *all* evictable buffers have been removed from the
2654 * state. As a result, if concurrent insertions into the state are
2655 * allowed (e.g. if the ARC isn't shutting down), this function might
2656 * wind up in an infinite loop, continually trying to evict buffers.
2657 */
2658static uint64_t
2659arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
2660 boolean_t retry)
2661{
2662 uint64_t evicted = 0;
2663
2664 while (state->arcs_lsize[type] != 0) {
2665 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
2666
2667 if (!retry)
2668 break;
2669 }
2670
2671 return (evicted);
34dc7c2f
BB
2672}
2673
ab26409d 2674/*
f6046738
BB
 2675 * Helper function for arc_prune(); it is responsible for safely handling
2676 * the execution of a registered arc_prune_func_t.
ab26409d
BB
2677 */
2678static void
f6046738 2679arc_prune_task(void *ptr)
ab26409d 2680{
f6046738
BB
2681 arc_prune_t *ap = (arc_prune_t *)ptr;
2682 arc_prune_func_t *func = ap->p_pfunc;
ab26409d 2683
f6046738
BB
2684 if (func != NULL)
2685 func(ap->p_adjust, ap->p_private);
ab26409d 2686
f6046738
BB
2687 /* Callback unregistered concurrently with execution */
2688 if (refcount_remove(&ap->p_refcnt, func) == 0) {
2689 ASSERT(!list_link_active(&ap->p_node));
2690 refcount_destroy(&ap->p_refcnt);
2691 kmem_free(ap, sizeof (*ap));
2692 }
2693}
ab26409d 2694
f6046738
BB
2695/*
2696 * Notify registered consumers they must drop holds on a portion of the ARC
 2697 * buffers they reference. This provides a mechanism to ensure the ARC can
2698 * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
2699 * is analogous to dnlc_reduce_cache() but more generic.
2700 *
 2701 * This operation is performed asynchronously so it may be safely called
ca67b33a 2702 * in the context of the arc_reclaim_thread(). A reference is taken here
f6046738
BB
2703 * for each registered arc_prune_t and the arc_prune_task() is responsible
2704 * for releasing it once the registered arc_prune_func_t has completed.
2705 */
2706static void
2707arc_prune_async(int64_t adjust)
2708{
2709 arc_prune_t *ap;
ab26409d 2710
f6046738
BB
2711 mutex_enter(&arc_prune_mtx);
2712 for (ap = list_head(&arc_prune_list); ap != NULL;
2713 ap = list_next(&arc_prune_list, ap)) {
ab26409d 2714
f6046738
BB
2715 if (refcount_count(&ap->p_refcnt) >= 2)
2716 continue;
ab26409d 2717
f6046738
BB
2718 refcount_add(&ap->p_refcnt, ap->p_pfunc);
2719 ap->p_adjust = adjust;
2720 taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
2721 ARCSTAT_BUMP(arcstat_prune);
ab26409d 2722 }
ab26409d
BB
2723 mutex_exit(&arc_prune_mtx);
2724}
2725
f6046738
BB
2726static void
2727arc_prune(int64_t adjust)
2728{
2729 arc_prune_async(adjust);
2730 taskq_wait_outstanding(arc_prune_taskq, 0);
2731}
2732
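/*
 * Editor's sketch (not part of arc.c): the shape of an arc_prune_func_t as
 * dispatched by arc_prune_task() above. Per the comment above, the real
 * consumers are higher layers that drop dentries/inodes pinning ARC metadata;
 * the callback name and body below are illustrative assumptions only.
 */
#if 0	/* illustrative only */
static void
example_prune_cb(int64_t nr_to_scan, void *priv)
{
	/*
	 * Drop up to nr_to_scan cached objects that hold ARC metadata
	 * buffers, so the ARC can make progress toward arc_meta_limit.
	 */
	(void) nr_to_scan;
	(void) priv;
}
#endif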
ca0bf58d
PS
2733/*
2734 * Evict the specified number of bytes from the state specified,
2735 * restricting eviction to the spa and type given. This function
2736 * prevents us from trying to evict more from a state's list than
2737 * is "evictable", and to skip evicting altogether when passed a
2738 * negative value for "bytes". In contrast, arc_evict_state() will
2739 * evict everything it can, when passed a negative value for "bytes".
2740 */
2741static uint64_t
2742arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
2743 arc_buf_contents_t type)
2744{
2745 int64_t delta;
2746
2747 if (bytes > 0 && state->arcs_lsize[type] > 0) {
2748 delta = MIN(state->arcs_lsize[type], bytes);
2749 return (arc_evict_state(state, spa, delta, type));
2750 }
2751
2752 return (0);
2753}
2754
2755/*
2756 * The goal of this function is to evict enough meta data buffers from the
2757 * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
2758 * more complicated than it appears because it is common for data buffers
2759 * to have holds on meta data buffers. In addition, dnode meta data buffers
2760 * will be held by the dnodes in the block preventing them from being freed.
2761 * This means we can't simply traverse the ARC and expect to always find
 2762 * enough unheld meta data buffers to release.
2763 *
2764 * Therefore, this function has been updated to make alternating passes
2765 * over the ARC releasing data buffers and then newly unheld meta data
2766 * buffers. This ensures forward progress is maintained and arc_meta_used
2767 * will decrease. Normally this is sufficient, but if required the ARC
2768 * will call the registered prune callbacks causing dentry and inodes to
2769 * be dropped from the VFS cache. This will make dnode meta data buffers
2770 * available for reclaim.
2771 */
2772static uint64_t
f6046738 2773arc_adjust_meta_balanced(void)
ca0bf58d
PS
2774{
2775 int64_t adjustmnt, delta, prune = 0;
2776 uint64_t total_evicted = 0;
2777 arc_buf_contents_t type = ARC_BUFC_DATA;
ca67b33a 2778 int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
ca0bf58d
PS
2779
2780restart:
2781 /*
 2782 * This slightly differs from the way we evict from the mru in
2783 * arc_adjust because we don't have a "target" value (i.e. no
2784 * "meta" arc_p). As a result, I think we can completely
2785 * cannibalize the metadata in the MRU before we evict the
2786 * metadata from the MFU. I think we probably need to implement a
2787 * "metadata arc_p" value to do this properly.
2788 */
2789 adjustmnt = arc_meta_used - arc_meta_limit;
2790
2791 if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
2792 delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
2793 total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
2794 adjustmnt -= delta;
2795 }
2796
2797 /*
2798 * We can't afford to recalculate adjustmnt here. If we do,
2799 * new metadata buffers can sneak into the MRU or ANON lists,
2800 * thus penalize the MFU metadata. Although the fudge factor is
2801 * small, it has been empirically shown to be significant for
2802 * certain workloads (e.g. creating many empty directories). As
2803 * such, we use the original calculation for adjustmnt, and
2804 * simply decrement the amount of data evicted from the MRU.
2805 */
2806
2807 if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
2808 delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
2809 total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
2810 }
2811
2812 adjustmnt = arc_meta_used - arc_meta_limit;
2813
2814 if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2815 delta = MIN(adjustmnt,
2816 arc_mru_ghost->arcs_lsize[type]);
2817 total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
2818 adjustmnt -= delta;
2819 }
2820
2821 if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
2822 delta = MIN(adjustmnt,
2823 arc_mfu_ghost->arcs_lsize[type]);
2824 total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
2825 }
2826
2827 /*
2828 * If after attempting to make the requested adjustment to the ARC
2829 * the meta limit is still being exceeded then request that the
2830 * higher layers drop some cached objects which have holds on ARC
2831 * meta buffers. Requests to the upper layers will be made with
2832 * increasingly large scan sizes until the ARC is below the limit.
2833 */
2834 if (arc_meta_used > arc_meta_limit) {
2835 if (type == ARC_BUFC_DATA) {
2836 type = ARC_BUFC_METADATA;
2837 } else {
2838 type = ARC_BUFC_DATA;
2839
2840 if (zfs_arc_meta_prune) {
2841 prune += zfs_arc_meta_prune;
f6046738 2842 arc_prune_async(prune);
ca0bf58d
PS
2843 }
2844 }
2845
2846 if (restarts > 0) {
2847 restarts--;
2848 goto restart;
2849 }
2850 }
2851 return (total_evicted);
2852}
2853
f6046738
BB
2854/*
2855 * Evict metadata buffers from the cache, such that arc_meta_used is
2856 * capped by the arc_meta_limit tunable.
2857 */
2858static uint64_t
2859arc_adjust_meta_only(void)
2860{
2861 uint64_t total_evicted = 0;
2862 int64_t target;
2863
2864 /*
2865 * If we're over the meta limit, we want to evict enough
2866 * metadata to get back under the meta limit. We don't want to
2867 * evict so much that we drop the MRU below arc_p, though. If
2868 * we're over the meta limit more than we're over arc_p, we
2869 * evict some from the MRU here, and some from the MFU below.
2870 */
2871 target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
36da08ef
PS
2872 (int64_t)(refcount_count(&arc_anon->arcs_size) +
2873 refcount_count(&arc_mru->arcs_size) - arc_p));
f6046738
BB
2874
2875 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2876
2877 /*
2878 * Similar to the above, we want to evict enough bytes to get us
2879 * below the meta limit, but not so much as to drop us below the
 2880 * space allotted to the MFU (which is defined as arc_c - arc_p).
2881 */
2882 target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
36da08ef 2883 (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
f6046738
BB
2884
2885 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2886
2887 return (total_evicted);
2888}
2889
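/*
 * Editor's note (worked example, not upstream text): suppose arc_meta_used
 * exceeds arc_meta_limit by 300MB, anon + MRU hold 900MB with arc_p = 700MB,
 * and the MFU holds 600MB with arc_c - arc_p = 500MB. arc_adjust_meta_only()
 * above then evicts MIN(300MB, 900MB - 700MB) = 200MB of metadata from the
 * MRU and MIN(300MB, 600MB - 500MB) = 100MB of metadata from the MFU,
 * trimming metadata without dropping either list below its share of the
 * cache.
 */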
2890static uint64_t
2891arc_adjust_meta(void)
2892{
2893 if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
2894 return (arc_adjust_meta_only());
2895 else
2896 return (arc_adjust_meta_balanced());
2897}
2898
ca0bf58d
PS
2899/*
2900 * Return the type of the oldest buffer in the given arc state
2901 *
2902 * This function will select a random sublist of type ARC_BUFC_DATA and
2903 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
2904 * is compared, and the type which contains the "older" buffer will be
2905 * returned.
2906 */
2907static arc_buf_contents_t
2908arc_adjust_type(arc_state_t *state)
2909{
2910 multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
2911 multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
2912 int data_idx = multilist_get_random_index(data_ml);
2913 int meta_idx = multilist_get_random_index(meta_ml);
2914 multilist_sublist_t *data_mls;
2915 multilist_sublist_t *meta_mls;
2916 arc_buf_contents_t type;
2917 arc_buf_hdr_t *data_hdr;
2918 arc_buf_hdr_t *meta_hdr;
2919
2920 /*
2921 * We keep the sublist lock until we're finished, to prevent
2922 * the headers from being destroyed via arc_evict_state().
2923 */
2924 data_mls = multilist_sublist_lock(data_ml, data_idx);
2925 meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
2926
2927 /*
2928 * These two loops are to ensure we skip any markers that
2929 * might be at the tail of the lists due to arc_evict_state().
2930 */
2931
2932 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
2933 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
2934 if (data_hdr->b_spa != 0)
2935 break;
2936 }
2937
2938 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
2939 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
2940 if (meta_hdr->b_spa != 0)
2941 break;
2942 }
2943
2944 if (data_hdr == NULL && meta_hdr == NULL) {
2945 type = ARC_BUFC_DATA;
2946 } else if (data_hdr == NULL) {
2947 ASSERT3P(meta_hdr, !=, NULL);
2948 type = ARC_BUFC_METADATA;
2949 } else if (meta_hdr == NULL) {
2950 ASSERT3P(data_hdr, !=, NULL);
2951 type = ARC_BUFC_DATA;
2952 } else {
2953 ASSERT3P(data_hdr, !=, NULL);
2954 ASSERT3P(meta_hdr, !=, NULL);
2955
2956 /* The headers can't be on the sublist without an L1 header */
2957 ASSERT(HDR_HAS_L1HDR(data_hdr));
2958 ASSERT(HDR_HAS_L1HDR(meta_hdr));
2959
2960 if (data_hdr->b_l1hdr.b_arc_access <
2961 meta_hdr->b_l1hdr.b_arc_access) {
2962 type = ARC_BUFC_DATA;
2963 } else {
2964 type = ARC_BUFC_METADATA;
2965 }
2966 }
2967
2968 multilist_sublist_unlock(meta_mls);
2969 multilist_sublist_unlock(data_mls);
2970
2971 return (type);
2972}
2973
2974/*
2975 * Evict buffers from the cache, such that arc_size is capped by arc_c.
2976 */
2977static uint64_t
2978arc_adjust(void)
2979{
2980 uint64_t total_evicted = 0;
2981 uint64_t bytes;
2982 int64_t target;
2983
2984 /*
2985 * If we're over arc_meta_limit, we want to correct that before
2986 * potentially evicting data buffers below.
2987 */
2988 total_evicted += arc_adjust_meta();
2989
2990 /*
2991 * Adjust MRU size
2992 *
2993 * If we're over the target cache size, we want to evict enough
2994 * from the list to get back to our target size. We don't want
2995 * to evict too much from the MRU, such that it drops below
2996 * arc_p. So, if we're over our target cache size more than
2997 * the MRU is over arc_p, we'll evict enough to get back to
2998 * arc_p here, and then evict more from the MFU below.
2999 */
3000 target = MIN((int64_t)(arc_size - arc_c),
36da08ef
PS
3001 (int64_t)(refcount_count(&arc_anon->arcs_size) +
3002 refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
ca0bf58d
PS
3003
3004 /*
3005 * If we're below arc_meta_min, always prefer to evict data.
3006 * Otherwise, try to satisfy the requested number of bytes to
3007 * evict from the type which contains older buffers; in an
3008 * effort to keep newer buffers in the cache regardless of their
3009 * type. If we cannot satisfy the number of bytes from this
3010 * type, spill over into the next type.
3011 */
3012 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
3013 arc_meta_used > arc_meta_min) {
3014 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3015 total_evicted += bytes;
3016
3017 /*
3018 * If we couldn't evict our target number of bytes from
3019 * metadata, we try to get the rest from data.
3020 */
3021 target -= bytes;
3022
3023 total_evicted +=
3024 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3025 } else {
3026 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3027 total_evicted += bytes;
3028
3029 /*
3030 * If we couldn't evict our target number of bytes from
3031 * data, we try to get the rest from metadata.
3032 */
3033 target -= bytes;
3034
3035 total_evicted +=
3036 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3037 }
3038
3039 /*
3040 * Adjust MFU size
3041 *
3042 * Now that we've tried to evict enough from the MRU to get its
3043 * size back to arc_p, if we're still above the target cache
3044 * size, we evict the rest from the MFU.
3045 */
3046 target = arc_size - arc_c;
3047
a7b10a93 3048 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
ca0bf58d
PS
3049 arc_meta_used > arc_meta_min) {
3050 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3051 total_evicted += bytes;
3052
3053 /*
3054 * If we couldn't evict our target number of bytes from
3055 * metadata, we try to get the rest from data.
3056 */
3057 target -= bytes;
3058
3059 total_evicted +=
3060 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3061 } else {
3062 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3063 total_evicted += bytes;
3064
3065 /*
3066 * If we couldn't evict our target number of bytes from
 3067 * data, we try to get the rest from metadata.
3068 */
3069 target -= bytes;
3070
3071 total_evicted +=
3072 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3073 }
3074
3075 /*
3076 * Adjust ghost lists
3077 *
3078 * In addition to the above, the ARC also defines target values
3079 * for the ghost lists. The sum of the mru list and mru ghost
3080 * list should never exceed the target size of the cache, and
3081 * the sum of the mru list, mfu list, mru ghost list, and mfu
3082 * ghost list should never exceed twice the target size of the
3083 * cache. The following logic enforces these limits on the ghost
3084 * caches, and evicts from them as needed.
3085 */
36da08ef
PS
3086 target = refcount_count(&arc_mru->arcs_size) +
3087 refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
ca0bf58d
PS
3088
3089 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3090 total_evicted += bytes;
3091
3092 target -= bytes;
3093
3094 total_evicted +=
3095 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3096
3097 /*
3098 * We assume the sum of the mru list and mfu list is less than
3099 * or equal to arc_c (we enforced this above), which means we
3100 * can use the simpler of the two equations below:
3101 *
3102 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3103 * mru ghost + mfu ghost <= arc_c
3104 */
36da08ef
PS
3105 target = refcount_count(&arc_mru_ghost->arcs_size) +
3106 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
ca0bf58d
PS
3107
3108 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3109 total_evicted += bytes;
3110
3111 target -= bytes;
3112
3113 total_evicted +=
3114 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3115
3116 return (total_evicted);
3117}
3118
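/*
 * Editor's note (worked example, not upstream text): for the ghost-list
 * targets in arc_adjust() above, take arc_c = 8GB, an MRU of 5GB and an MRU
 * ghost list of 4.5GB. The first target is 5 + 4.5 - 8 = 1.5GB, so 1.5GB is
 * evicted from the MRU ghost list to keep mru + mru_ghost <= arc_c. If the
 * MFU ghost list then holds 6GB, the second target is 3 + 6 - 8 = 1GB evicted
 * from the MFU ghost list, enforcing mru_ghost + mfu_ghost <= arc_c (and
 * hence the overall 2 * arc_c bound).
 */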
34dc7c2f
BB
3119static void
3120arc_do_user_evicts(void)
3121{
ca0bf58d 3122 mutex_enter(&arc_user_evicts_lock);
34dc7c2f
BB
3123 while (arc_eviction_list != NULL) {
3124 arc_buf_t *buf = arc_eviction_list;
3125 arc_eviction_list = buf->b_next;
428870ff 3126 mutex_enter(&buf->b_evict_lock);
34dc7c2f 3127 buf->b_hdr = NULL;
428870ff 3128 mutex_exit(&buf->b_evict_lock);
ca0bf58d 3129 mutex_exit(&arc_user_evicts_lock);
34dc7c2f
BB
3130
3131 if (buf->b_efunc != NULL)
bd089c54 3132 VERIFY0(buf->b_efunc(buf->b_private));
34dc7c2f
BB
3133
3134 buf->b_efunc = NULL;
3135 buf->b_private = NULL;
3136 kmem_cache_free(buf_cache, buf);
ca0bf58d 3137 mutex_enter(&arc_user_evicts_lock);
34dc7c2f 3138 }
ca0bf58d 3139 mutex_exit(&arc_user_evicts_lock);
34dc7c2f
BB
3140}
3141
ca0bf58d
PS
3142void
3143arc_flush(spa_t *spa, boolean_t retry)
ab26409d 3144{
ca0bf58d 3145 uint64_t guid = 0;
94520ca4 3146
bc888666 3147 /*
ca0bf58d
PS
3148 * If retry is TRUE, a spa must not be specified since we have
3149 * no good way to determine if all of a spa's buffers have been
3150 * evicted from an arc state.
bc888666 3151 */
ca0bf58d 3152 ASSERT(!retry || spa == 0);
d164b209 3153
b9541d6b 3154 if (spa != NULL)
3541dc6d 3155 guid = spa_load_guid(spa);
d164b209 3156
ca0bf58d
PS
3157 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
3158 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
3159
3160 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
3161 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
3162
3163 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
3164 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
34dc7c2f 3165
ca0bf58d
PS
3166 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
3167 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
34dc7c2f 3168
34dc7c2f 3169 arc_do_user_evicts();
34dc7c2f
BB
3170 ASSERT(spa || arc_eviction_list == NULL);
3171}
3172
34dc7c2f 3173void
ca67b33a 3174arc_shrink(int64_t to_free)
34dc7c2f
BB
3175{
3176 if (arc_c > arc_c_min) {
302f753f 3177
34dc7c2f
BB
3178 if (arc_c > arc_c_min + to_free)
3179 atomic_add_64(&arc_c, -to_free);
3180 else
3181 arc_c = arc_c_min;
3182
ca67b33a 3183 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
34dc7c2f
BB
3184 if (arc_c > arc_size)
3185 arc_c = MAX(arc_size, arc_c_min);
3186 if (arc_p > arc_c)
3187 arc_p = (arc_c >> 1);
3188 ASSERT(arc_c >= arc_c_min);
3189 ASSERT((int64_t)arc_p >= 0);
3190 }
3191
3192 if (arc_size > arc_c)
ca0bf58d 3193 (void) arc_adjust();
34dc7c2f
BB
3194}
3195
ca67b33a
MA
3196typedef enum free_memory_reason_t {
3197 FMR_UNKNOWN,
3198 FMR_NEEDFREE,
3199 FMR_LOTSFREE,
3200 FMR_SWAPFS_MINFREE,
3201 FMR_PAGES_PP_MAXIMUM,
3202 FMR_HEAP_ARENA,
3203 FMR_ZIO_ARENA,
3204} free_memory_reason_t;
3205
3206int64_t last_free_memory;
3207free_memory_reason_t last_free_reason;
3208
3209#ifdef _KERNEL
3210#ifdef __linux__
3211/*
3212 * expiration time for arc_no_grow set by direct memory reclaim.
3213 */
3214static clock_t arc_grow_time = 0;
3215#else
3216/*
3217 * Additional reserve of pages for pp_reserve.
3218 */
3219int64_t arc_pages_pp_reserve = 64;
3220
3221/*
3222 * Additional reserve of pages for swapfs.
3223 */
3224int64_t arc_swapfs_reserve = 64;
3225#endif
3226#endif /* _KERNEL */
3227
3228/*
3229 * Return the amount of memory that can be consumed before reclaim will be
3230 * needed. Positive if there is sufficient free memory, negative indicates
3231 * the amount of memory that needs to be freed up.
3232 */
3233static int64_t
3234arc_available_memory(void)
3235{
3236 int64_t lowest = INT64_MAX;
3237 free_memory_reason_t r = FMR_UNKNOWN;
3238
3239#ifdef _KERNEL
3240#ifdef __linux__
3241 /*
3242 * Under Linux we are not allowed to directly interrogate the global
3243 * memory state. Instead rely on observing that direct reclaim has
3244 * recently occurred therefore the system must be low on memory. The
3245 * exact values returned are not critical but should be small.
3246 */
3247 if (ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
3248 lowest = PAGE_SIZE;
3249 else
3250 lowest = -PAGE_SIZE;
3251#else
3252 int64_t n;
3253
3254 /*
3255 	 * Platforms like illumos have greater visibility into the memory
3256 * subsystem and can return a more detailed analysis of memory.
3257 */
3258 if (needfree > 0) {
3259 n = PAGESIZE * (-needfree);
3260 if (n < lowest) {
3261 lowest = n;
3262 r = FMR_NEEDFREE;
3263 }
3264 }
3265
3266 /*
3267 * check that we're out of range of the pageout scanner. It starts to
3268 * schedule paging if freemem is less than lotsfree and needfree.
3269 * lotsfree is the high-water mark for pageout, and needfree is the
3270 * number of needed free pages. We add extra pages here to make sure
3271 * the scanner doesn't start up while we're freeing memory.
3272 */
3273 n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3274 if (n < lowest) {
3275 lowest = n;
3276 r = FMR_LOTSFREE;
3277 }
3278
3279 /*
3280 * check to make sure that swapfs has enough space so that anon
3281 * reservations can still succeed. anon_resvmem() checks that the
3282 * availrmem is greater than swapfs_minfree, and the number of reserved
3283 * swap pages. We also add a bit of extra here just to prevent
3284 * circumstances from getting really dire.
3285 */
3286 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3287 desfree - arc_swapfs_reserve);
3288 if (n < lowest) {
3289 lowest = n;
3290 r = FMR_SWAPFS_MINFREE;
3291 }
3292
3293
3294 /*
3295 * Check that we have enough availrmem that memory locking (e.g., via
3296 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
3297 * stores the number of pages that cannot be locked; when availrmem
3298 * drops below pages_pp_maximum, page locking mechanisms such as
3299 * page_pp_lock() will fail.)
3300 */
3301 n = PAGESIZE * (availrmem - pages_pp_maximum -
3302 arc_pages_pp_reserve);
3303 if (n < lowest) {
3304 lowest = n;
3305 r = FMR_PAGES_PP_MAXIMUM;
3306 }
3307
3308#if defined(__i386)
3309 /*
3310 * If we're on an i386 platform, it's possible that we'll exhaust the
3311 * kernel heap space before we ever run out of available physical
3312 * memory. Most checks of the size of the heap_area compare against
3313 * tune.t_minarmem, which is the minimum available real memory that we
3314 * can have in the system. However, this is generally fixed at 25 pages
3315 * which is so low that it's useless. In this comparison, we seek to
3316 * calculate the total heap-size, and reclaim if more than 3/4ths of the
3317 * heap is allocated. (Or, in the calculation, if less than 1/4th is
3318 * free)
3319 */
3320 n = vmem_size(heap_arena, VMEM_FREE) -
3321 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3322 if (n < lowest) {
3323 lowest = n;
3324 r = FMR_HEAP_ARENA;
3325 }
3326#endif
3327
3328 /*
3329 * If zio data pages are being allocated out of a separate heap segment,
3330 * then enforce that the size of available vmem for this arena remains
3331 * above about 1/16th free.
3332 *
3333 * Note: The 1/16th arena free requirement was put in place
3334 * to aggressively evict memory from the arc in order to avoid
3335 * memory fragmentation issues.
3336 */
3337 if (zio_arena != NULL) {
3338 n = vmem_size(zio_arena, VMEM_FREE) -
3339 (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3340 if (n < lowest) {
3341 lowest = n;
3342 r = FMR_ZIO_ARENA;
3343 }
3344 }
3345#endif /* __linux__ */
3346#else
3347 /* Every 100 calls, free a small amount */
3348 if (spa_get_random(100) == 0)
3349 lowest = -1024;
3350#endif
3351
3352 last_free_memory = lowest;
3353 last_free_reason = r;
3354
3355 return (lowest);
3356}
3357
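arc_available_memory() applies the same pattern to every limit it checks: compute the margin to that limit and keep the smallest (most negative) value along with the reason for it. A hedged userland sketch of the pattern, using invented thresholds rather than the kernel's counters:

/* Illustrative sketch only -- invented numbers, not the kernel's counters. */
#include <stdint.h>
#include <stdio.h>

typedef enum { R_UNKNOWN, R_LOTSFREE, R_SWAPFS_MINFREE } reason_t;

int
main(void)
{
	int64_t lowest = INT64_MAX;
	reason_t r = R_UNKNOWN;
	int64_t pagesize = 4096;
	int64_t freemem = 1000, lotsfree = 900;		/* pages, invented */
	int64_t availrmem = 500, swapfs_minfree = 600;	/* pages, invented */
	int64_t n;

	n = pagesize * (freemem - lotsfree);
	if (n < lowest) { lowest = n; r = R_LOTSFREE; }

	n = pagesize * (availrmem - swapfs_minfree);
	if (n < lowest) { lowest = n; r = R_SWAPFS_MINFREE; }

	/* Negative: that many bytes must be freed; r records why. */
	printf("lowest = %lld, reason = %d\n", (long long)lowest, (int)r);
	return (0);
}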
3358/*
3359 * Determine if the system is under memory pressure and is asking
3360 * to reclaim memory. A return value of TRUE indicates that the system
3361 * is under memory pressure and that the arc should adjust accordingly.
3362 */
3363static boolean_t
3364arc_reclaim_needed(void)
3365{
3366 return (arc_available_memory() < 0);
3367}
3368
34dc7c2f 3369static void
ca67b33a 3370arc_kmem_reap_now(void)
34dc7c2f
BB
3371{
3372 size_t i;
3373 kmem_cache_t *prev_cache = NULL;
3374 kmem_cache_t *prev_data_cache = NULL;
3375 extern kmem_cache_t *zio_buf_cache[];
3376 extern kmem_cache_t *zio_data_buf_cache[];
669dedb3 3377 extern kmem_cache_t *range_seg_cache;
34dc7c2f 3378
f6046738
BB
3379 if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
3380 /*
3381 * We are exceeding our meta-data cache limit.
3382 * Prune some entries to release holds on meta-data.
3383 */
3384 arc_prune(zfs_arc_meta_prune);
3385 }
3386
34dc7c2f
BB
3387 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3388 if (zio_buf_cache[i] != prev_cache) {
3389 prev_cache = zio_buf_cache[i];
3390 kmem_cache_reap_now(zio_buf_cache[i]);
3391 }
3392 if (zio_data_buf_cache[i] != prev_data_cache) {
3393 prev_data_cache = zio_data_buf_cache[i];
3394 kmem_cache_reap_now(zio_data_buf_cache[i]);
3395 }
3396 }
ca0bf58d 3397 kmem_cache_reap_now(buf_cache);
b9541d6b
CW
3398 kmem_cache_reap_now(hdr_full_cache);
3399 kmem_cache_reap_now(hdr_l2only_cache);
669dedb3 3400 kmem_cache_reap_now(range_seg_cache);
ca67b33a
MA
3401
3402 if (zio_arena != NULL) {
3403 /*
3404 * Ask the vmem arena to reclaim unused memory from its
3405 * quantum caches.
3406 */
3407 vmem_qcache_reap(zio_arena);
3408 }
34dc7c2f
BB
3409}
3410
302f753f 3411/*
ca0bf58d
PS
3412 * Threads can block in arc_get_data_buf() waiting for this thread to evict
3413 * enough data and signal them to proceed. When this happens, the threads in
3414 * arc_get_data_buf() are sleeping while holding the hash lock for their
3415 * particular arc header. Thus, we must be careful to never sleep on a
3416 * hash lock in this thread. This is to prevent the following deadlock:
3417 *
3418 * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
3419 * waiting for the reclaim thread to signal it.
3420 *
3421 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
3422 * fails, and goes to sleep forever.
3423 *
3424 * This possible deadlock is avoided by always acquiring a hash lock
3425 * using mutex_tryenter() from arc_reclaim_thread().
302f753f 3426 */
34dc7c2f 3427static void
ca67b33a 3428arc_reclaim_thread(void)
34dc7c2f 3429{
ca67b33a
MA
3430 fstrans_cookie_t cookie = spl_fstrans_mark();
3431 clock_t growtime = 0;
34dc7c2f
BB
3432 callb_cpr_t cpr;
3433
ca0bf58d 3434 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
34dc7c2f 3435
ca0bf58d 3436 mutex_enter(&arc_reclaim_lock);
ca67b33a
MA
3437 while (!arc_reclaim_thread_exit) {
3438 int64_t to_free;
3439 int64_t free_memory = arc_available_memory();
3440 uint64_t evicted = 0;
302f753f 3441
ca67b33a 3442 arc_tuning_update();
34dc7c2f 3443
ca67b33a 3444 mutex_exit(&arc_reclaim_lock);
34dc7c2f 3445
ca67b33a 3446 if (free_memory < 0) {
34dc7c2f 3447
ca67b33a 3448 arc_no_grow = B_TRUE;
b128c09f 3449 arc_warm = B_TRUE;
34dc7c2f 3450
ca67b33a
MA
3451 /*
3452 * Wait at least zfs_grow_retry (default 5) seconds
3453 * before considering growing.
3454 */
3455 growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
6a8f9b6b 3456
ca67b33a 3457 arc_kmem_reap_now();
34dc7c2f 3458
ca67b33a
MA
3459 /*
3460 * If we are still low on memory, shrink the ARC
3461 * so that we have arc_shrink_min free space.
3462 */
3463 free_memory = arc_available_memory();
34dc7c2f 3464
ca67b33a
MA
3465 to_free = (arc_c >> arc_shrink_shift) - free_memory;
3466 if (to_free > 0) {
3467#ifdef _KERNEL
3468 to_free = MAX(to_free, ptob(needfree));
3469#endif
3470 arc_shrink(to_free);
3471 }
3472 } else if (free_memory < arc_c >> arc_no_grow_shift) {
3473 arc_no_grow = B_TRUE;
3474 } else if (ddi_get_lbolt() >= growtime) {
3475 arc_no_grow = B_FALSE;
3476 }
bce45ec9 3477
ca67b33a 3478 evicted = arc_adjust();
bce45ec9 3479
ca67b33a 3480 mutex_enter(&arc_reclaim_lock);
bce45ec9 3481
ca67b33a
MA
3482 /*
3483 * If evicted is zero, we couldn't evict anything via
3484 * arc_adjust(). This could be due to hash lock
3485 * collisions, but more likely due to the majority of
3486 * arc buffers being unevictable. Therefore, even if
3487 * arc_size is above arc_c, another pass is unlikely to
3488 * be helpful and could potentially cause us to enter an
3489 * infinite loop.
3490 */
3491 if (arc_size <= arc_c || evicted == 0) {
3492 /*
3493 * We're either no longer overflowing, or we
3494 * can't evict anything more, so we should wake
3495 * up any threads before we go to sleep.
3496 */
3497 cv_broadcast(&arc_reclaim_waiters_cv);
bce45ec9 3498
ca67b33a
MA
3499 /*
3500 * Block until signaled, or after one second (we
3501 * might need to perform arc_kmem_reap_now()
3502 * even if we aren't being signalled)
3503 */
3504 CALLB_CPR_SAFE_BEGIN(&cpr);
3505 (void) cv_timedwait_sig(&arc_reclaim_thread_cv,
3506 &arc_reclaim_lock, ddi_get_lbolt() + hz);
3507 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
3508 }
ca0bf58d 3509 }
bce45ec9 3510
ca67b33a 3511 arc_reclaim_thread_exit = FALSE;
ca0bf58d
PS
3512 cv_broadcast(&arc_reclaim_thread_cv);
3513 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
3514 spl_fstrans_unmark(cookie);
3515 thread_exit();
3516}
3517
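The deadlock rule described before arc_reclaim_thread() (never block on a hash lock from the reclaim side) comes down to using a try-lock and skipping the entry when the lock is contended. A hedged pthreads analogue of that shape, not the SPL mutex API:

/* Illustrative sketch only -- pthreads stand in for kernel mutexes. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called from the "reclaim" side: never sleeps on hash_lock. */
static int
try_evict_entry(void)
{
	if (pthread_mutex_trylock(&hash_lock) != 0)
		return (0);	/* someone holds it; skip, don't sleep */
	/* ... evict the entry ... */
	pthread_mutex_unlock(&hash_lock);
	return (1);
}

int
main(void)
{
	printf("evicted: %d\n", try_evict_entry());
	return (0);
}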
3518static void
3519arc_user_evicts_thread(void)
3520{
ca67b33a 3521 fstrans_cookie_t cookie = spl_fstrans_mark();
ca0bf58d 3522 callb_cpr_t cpr;
bce45ec9 3523
ca0bf58d 3524 CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
bce45ec9 3525
ca0bf58d
PS
3526 mutex_enter(&arc_user_evicts_lock);
3527 while (!arc_user_evicts_thread_exit) {
3528 mutex_exit(&arc_user_evicts_lock);
3529
3530 arc_do_user_evicts();
3531
3532 /*
3533 * This is necessary in order for the mdb ::arc dcmd to
3534 * show up to date information. Since the ::arc command
3535 * does not call the kstat's update function, without
3536 * this call, the command may show stale stats for the
3537 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3538 * with this change, the data might be up to 1 second
3539 * out of date; but that should suffice. The arc_state_t
3540 * structures can be queried directly if more accurate
3541 * information is needed.
3542 */
3543 if (arc_ksp != NULL)
3544 arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3545
3546 mutex_enter(&arc_user_evicts_lock);
3547
3548 /*
3549 * Block until signaled, or after one second (we need to
3550 * call the arc's kstat update function regularly).
3551 */
3552 CALLB_CPR_SAFE_BEGIN(&cpr);
b64ccd6c 3553 (void) cv_timedwait_sig(&arc_user_evicts_cv,
ca0bf58d
PS
3554 &arc_user_evicts_lock, ddi_get_lbolt() + hz);
3555 CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
34dc7c2f
BB
3556 }
3557
ca0bf58d
PS
3558 arc_user_evicts_thread_exit = FALSE;
3559 cv_broadcast(&arc_user_evicts_cv);
3560 CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */
40d06e3c 3561 spl_fstrans_unmark(cookie);
34dc7c2f
BB
3562 thread_exit();
3563}
3564
7cb67b45
BB
3565#ifdef _KERNEL
3566/*
302f753f
BB
3567 * Determine the amount of memory eligible for eviction contained in the
3568 * ARC. All clean data reported by the ghost lists can always be safely
3569 * evicted. Due to arc_c_min, the same does not hold for all clean data
3570 * contained by the regular mru and mfu lists.
3571 *
3572 * In the case of the regular mru and mfu lists, we need to report as
3573 * much clean data as possible, such that evicting that same reported
3574 * data will not bring arc_size below arc_c_min. Thus, in certain
3575 * circumstances, the total amount of clean data in the mru and mfu
3576 * lists might not actually be evictable.
3577 *
3578 * The following two distinct cases are accounted for:
3579 *
3580 * 1. The sum of the amount of dirty data contained by both the mru and
3581 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
3582 * is greater than or equal to arc_c_min.
3583 * (i.e. amount of dirty data >= arc_c_min)
3584 *
3585 * This is the easy case; all clean data contained by the mru and mfu
3586 * lists is evictable. Evicting all clean data can only drop arc_size
3587 * to the amount of dirty data, which is greater than arc_c_min.
3588 *
3589 * 2. The sum of the amount of dirty data contained by both the mru and
3590 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
3591 * is less than arc_c_min.
3592 * (i.e. arc_c_min > amount of dirty data)
3593 *
3594  * 2.1. arc_size is greater than or equal to arc_c_min.
3595 * (i.e. arc_size >= arc_c_min > amount of dirty data)
3596 *
3597 * In this case, not all clean data from the regular mru and mfu
3598 * lists is actually evictable; we must leave enough clean data
3599 * to keep arc_size above arc_c_min. Thus, the maximum amount of
3600 * evictable data from the two lists combined, is exactly the
3601 * difference between arc_size and arc_c_min.
3602 *
3603 * 2.2. arc_size is less than arc_c_min
3604 * (i.e. arc_c_min > arc_size > amount of dirty data)
3605 *
3606 * In this case, none of the data contained in the mru and mfu
3607 * lists is evictable, even if it's clean. Since arc_size is
3608 * already below arc_c_min, evicting any more would only
3609 * increase this negative difference.
7cb67b45 3610 */
302f753f
BB
3611static uint64_t
3612arc_evictable_memory(void) {
3613 uint64_t arc_clean =
3614 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3615 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3616 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3617 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3618 uint64_t ghost_clean =
3619 arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
3620 arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
3621 arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
3622 arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
3623 uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
3624
3625 if (arc_dirty >= arc_c_min)
3626 return (ghost_clean + arc_clean);
3627
3628 return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
3629}
3630
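A worked example of the two cases handled by arc_evictable_memory(), with invented byte counts: ghost-list space is always reported, while clean mru/mfu space is clipped so that evicting it cannot push arc_size below arc_c_min.

/* Illustrative sketch only -- invented sizes, userland arithmetic. */
#include <stdint.h>
#include <stdio.h>

static uint64_t
evictable(uint64_t arc_size, uint64_t arc_c_min, uint64_t clean,
    uint64_t ghost_clean)
{
	uint64_t dirty = (arc_size > clean) ? arc_size - clean : 0;

	if (dirty >= arc_c_min)			/* case 1 */
		return (ghost_clean + clean);
	if (arc_size > arc_c_min)		/* case 2.1 */
		return (ghost_clean + (arc_size - arc_c_min));
	return (ghost_clean);			/* case 2.2 */
}

int
main(void)
{
	/* arc_size 800, arc_c_min 300, clean 600 -> dirty 200 < 300 */
	printf("%llu\n", (unsigned long long)evictable(800, 300, 600, 100));
	/* prints 600: 100 of ghost space plus (800 - 300) of clean space */
	return (0);
}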
ed6e9cc2
TC
3631/*
3632 * If sc->nr_to_scan is zero, the caller is requesting a query of the
3633 * number of objects which can potentially be freed. If it is nonzero,
3634 * the request is to free that many objects.
3635 *
3636 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
3637 * in struct shrinker and also require the shrinker to return the number
3638 * of objects freed.
3639 *
3640 * Older kernels require the shrinker to return the number of freeable
3641 * objects following the freeing of nr_to_free.
3642 */
3643static spl_shrinker_t
7e7baeca 3644__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
7cb67b45 3645{
ed6e9cc2 3646 int64_t pages;
7cb67b45 3647
302f753f
BB
3648 /* The arc is considered warm once reclaim has occurred */
3649 if (unlikely(arc_warm == B_FALSE))
3650 arc_warm = B_TRUE;
7cb67b45 3651
302f753f 3652 /* Return the potential number of reclaimable pages */
ed6e9cc2 3653 pages = btop((int64_t)arc_evictable_memory());
302f753f
BB
3654 if (sc->nr_to_scan == 0)
3655 return (pages);
3fd70ee6
BB
3656
3657 /* Not allowed to perform filesystem reclaim */
7e7baeca 3658 if (!(sc->gfp_mask & __GFP_FS))
ed6e9cc2 3659 return (SHRINK_STOP);
3fd70ee6 3660
7cb67b45 3661 /* Reclaim in progress */
ca0bf58d 3662 if (mutex_tryenter(&arc_reclaim_lock) == 0)
ed6e9cc2 3663 return (SHRINK_STOP);
7cb67b45 3664
ca0bf58d
PS
3665 mutex_exit(&arc_reclaim_lock);
3666
302f753f
BB
3667 /*
3668 	 * Evict the requested number of pages by shrinking arc_c by the
3669 	 * requested amount. If there is nothing left to evict, just
3670 * reap whatever we can from the various arc slabs.
3671 */
3672 if (pages > 0) {
ca67b33a
MA
3673 arc_shrink(ptob(sc->nr_to_scan));
3674 arc_kmem_reap_now();
ed6e9cc2
TC
3675#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
3676 pages = MAX(pages - btop(arc_evictable_memory()), 0);
3677#else
1e3cb67b 3678 pages = btop(arc_evictable_memory());
ed6e9cc2 3679#endif
302f753f 3680 } else {
ca67b33a 3681 arc_kmem_reap_now();
ed6e9cc2 3682 pages = SHRINK_STOP;
302f753f
BB
3683 }
3684
ca0bf58d
PS
3685 /*
3686 * We've reaped what we can, wake up threads.
3687 */
3688 cv_broadcast(&arc_reclaim_waiters_cv);
3689
302f753f
BB
3690 /*
3691 * When direct reclaim is observed it usually indicates a rapid
3692 * increase in memory pressure. This occurs because the kswapd
3693 * threads were unable to asynchronously keep enough free memory
3694 * available. In this case set arc_no_grow to briefly pause arc
3695 * growth to avoid compounding the memory pressure.
3696 */
7cb67b45 3697 if (current_is_kswapd()) {
302f753f 3698 ARCSTAT_BUMP(arcstat_memory_indirect_count);
7cb67b45 3699 } else {
302f753f 3700 arc_no_grow = B_TRUE;
bce45ec9 3701 arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
302f753f 3702 ARCSTAT_BUMP(arcstat_memory_direct_count);
7cb67b45
BB
3703 }
3704
1e3cb67b 3705 return (pages);
7cb67b45 3706}
7e7baeca 3707SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
7cb67b45
BB
3708
3709SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
3710#endif /* _KERNEL */
3711
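The query-versus-scan contract described above can be modelled in a few lines: nr_to_scan == 0 means "report how much could be freed", anything else means "free up to this much and report what you actually did". A hedged userland model of that contract, not the kernel's struct shrinker:

/* Illustrative sketch only -- models the callback contract in userland. */
#include <stdio.h>

static long cached_pages = 1000;	/* invented cache size, in pages */

static long
shrink(unsigned long nr_to_scan)
{
	if (nr_to_scan == 0)
		return (cached_pages);	/* query: how much could we free? */

	/* scan: free up to nr_to_scan pages and report what was freed */
	long freed = (nr_to_scan < (unsigned long)cached_pages) ?
	    (long)nr_to_scan : cached_pages;
	cached_pages -= freed;
	return (freed);
}

int
main(void)
{
	printf("query: %ld\n", shrink(0));	/* 1000 */
	printf("scan:  %ld\n", shrink(128));	/* 128 */
	return (0);
}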
34dc7c2f
BB
3712/*
3713 * Adapt arc info given the number of bytes we are trying to add and
3714  * the state that we are coming from. This function is only called
3715 * when we are adding new content to the cache.
3716 */
3717static void
3718arc_adapt(int bytes, arc_state_t *state)
3719{
3720 int mult;
36da08ef
PS
3721 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
3722 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
34dc7c2f
BB
3723
3724 if (state == arc_l2c_only)
3725 return;
3726
3727 ASSERT(bytes > 0);
3728 /*
3729 * Adapt the target size of the MRU list:
3730 * - if we just hit in the MRU ghost list, then increase
3731 * the target size of the MRU list.
3732 * - if we just hit in the MFU ghost list, then increase
3733 * the target size of the MFU list by decreasing the
3734 * target size of the MRU list.
3735 */
3736 if (state == arc_mru_ghost) {
36da08ef 3737 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
62422785
PS
3738 if (!zfs_arc_p_dampener_disable)
3739 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
34dc7c2f 3740
f521ce1b 3741 arc_p = MIN(arc_c, arc_p + bytes * mult);
34dc7c2f 3742 } else if (state == arc_mfu_ghost) {
d164b209
BB
3743 uint64_t delta;
3744
36da08ef 3745 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
62422785
PS
3746 if (!zfs_arc_p_dampener_disable)
3747 mult = MIN(mult, 10);
34dc7c2f 3748
d164b209 3749 delta = MIN(bytes * mult, arc_p);
f521ce1b 3750 arc_p = MAX(0, arc_p - delta);
34dc7c2f
BB
3751 }
3752 ASSERT((int64_t)arc_p >= 0);
3753
ca67b33a
MA
3754 if (arc_reclaim_needed()) {
3755 cv_signal(&arc_reclaim_thread_cv);
3756 return;
3757 }
3758
34dc7c2f
BB
3759 if (arc_no_grow)
3760 return;
3761
3762 if (arc_c >= arc_c_max)
3763 return;
3764
3765 /*
3766 * If we're within (2 * maxblocksize) bytes of the target
3767 * cache size, increment the target cache size
3768 */
121b3cae
TC
3769 VERIFY3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
3770 if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
34dc7c2f
BB
3771 atomic_add_64(&arc_c, (int64_t)bytes);
3772 if (arc_c > arc_c_max)
3773 arc_c = arc_c_max;
3774 else if (state == arc_anon)
3775 atomic_add_64(&arc_p, (int64_t)bytes);
3776 if (arc_p > arc_c)
3777 arc_p = arc_c;
3778 }
3779 ASSERT((int64_t)arc_p >= 0);
3780}
3781
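How arc_p moves on ghost hits, with invented sizes: a hit in the mru ghost list grows arc_p (the MRU target) by bytes * mult, a hit in the mfu ghost list shrinks it, and mult is the ratio of the two ghost list sizes, dampened to at most 10 unless zfs_arc_p_dampener_disable is set. Prefetch and reclaim interactions are omitted from this sketch.

/* Illustrative sketch only -- invented sizes; mirrors the dampened ratio. */
#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t arc_c = 1000, arc_p = 400, bytes = 16;
	int64_t mrug = 50, mfug = 900;		/* ghost list sizes */
	int mult;

	/* Hit in the mru ghost list: favor MRU. */
	mult = (mrug >= mfug) ? 1 : (int)(mfug / mrug);	/* 18 */
	mult = MIN(mult, 10);				/* dampened to 10 */
	arc_p = MIN(arc_c, arc_p + bytes * mult);	/* 400 + 160 = 560 */

	printf("arc_p = %llu\n", (unsigned long long)arc_p);
	return (0);
}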
3782/*
ca0bf58d
PS
3783 * Check if arc_size has grown past our upper threshold, determined by
3784 * zfs_arc_overflow_shift.
34dc7c2f 3785 */
ca0bf58d
PS
3786static boolean_t
3787arc_is_overflowing(void)
34dc7c2f 3788{
ca0bf58d
PS
3789 /* Always allow at least one block of overflow */
3790 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
3791 arc_c >> zfs_arc_overflow_shift);
34dc7c2f 3792
ca0bf58d 3793 return (arc_size >= arc_c + overflow);
34dc7c2f
BB
3794}
3795
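The overflow test below lets arc_size exceed arc_c by a slack of MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift) before writers are made to wait. A quick numeric sketch; the 16 MB block size and the shift value of 8 are assumptions for illustration:

/* Illustrative sketch only -- assumed constants, userland arithmetic. */
#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)		((a) > (b) ? (a) : (b))
#define	SPA_MAXBLOCKSIZE	(1ULL << 24)	/* 16 MB, assumed */

int
main(void)
{
	uint64_t arc_c = 8ULL << 30;		/* 8 GB target */
	int zfs_arc_overflow_shift = 8;		/* assumed value */
	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
	    arc_c >> zfs_arc_overflow_shift);	/* 32 MB of slack */

	printf("writers block once arc_size >= %llu\n",
	    (unsigned long long)(arc_c + overflow));
	return (0);
}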
3796/*
ca0bf58d
PS
3797 * The buffer, supplied as the first argument, needs a data block. If we
3798 * are hitting the hard limit for the cache size, we must sleep, waiting
3799 * for the eviction thread to catch up. If we're past the target size
3800 * but below the hard limit, we'll only signal the reclaim thread and
3801 * continue on.
34dc7c2f
BB
3802 */
3803static void
3804arc_get_data_buf(arc_buf_t *buf)
3805{
b9541d6b 3806 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
34dc7c2f 3807 uint64_t size = buf->b_hdr->b_size;
b9541d6b 3808 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
34dc7c2f
BB
3809
3810 arc_adapt(size, state);
3811
3812 /*
ca0bf58d
PS
3813 * If arc_size is currently overflowing, and has grown past our
3814 * upper limit, we must be adding data faster than the evict
3815 * thread can evict. Thus, to ensure we don't compound the
3816 * problem by adding more data and forcing arc_size to grow even
3817 	 * further past its target size, we halt and wait for the
3818 * eviction thread to catch up.
3819 *
3820 * It's also possible that the reclaim thread is unable to evict
3821 * enough buffers to get arc_size below the overflow limit (e.g.
3822 * due to buffers being un-evictable, or hash lock collisions).
3823 	 * In this case, we want to proceed regardless of whether we're
3824 * overflowing; thus we don't use a while loop here.
34dc7c2f 3825 */
ca0bf58d
PS
3826 if (arc_is_overflowing()) {
3827 mutex_enter(&arc_reclaim_lock);
3828
3829 /*
3830 * Now that we've acquired the lock, we may no longer be
3831 		 * over the overflow limit, let's check.
3832 *
3833 * We're ignoring the case of spurious wake ups. If that
3834 * were to happen, it'd let this thread consume an ARC
3835 * buffer before it should have (i.e. before we're under
3836 * the overflow limit and were signalled by the reclaim
3837 * thread). As long as that is a rare occurrence, it
3838 * shouldn't cause any harm.
3839 */
3840 if (arc_is_overflowing()) {
3841 cv_signal(&arc_reclaim_thread_cv);
3842 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
34dc7c2f 3843 }
34dc7c2f 3844
ca0bf58d 3845 mutex_exit(&arc_reclaim_lock);
34dc7c2f 3846 }
ab26409d 3847
da8ccd0e 3848 if (type == ARC_BUFC_METADATA) {
ca0bf58d
PS
3849 buf->b_data = zio_buf_alloc(size);
3850 arc_space_consume(size, ARC_SPACE_META);
3851 } else {
3852 ASSERT(type == ARC_BUFC_DATA);
3853 buf->b_data = zio_data_buf_alloc(size);
3854 arc_space_consume(size, ARC_SPACE_DATA);
da8ccd0e
PS
3855 }
3856
34dc7c2f
BB
3857 /*
3858 * Update the state size. Note that ghost states have a
3859 * "ghost size" and so don't need to be updated.
3860 */
b9541d6b 3861 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
34dc7c2f 3862 arc_buf_hdr_t *hdr = buf->b_hdr;
36da08ef 3863 arc_state_t *state = hdr->b_l1hdr.b_state;
34dc7c2f 3864
36da08ef 3865 (void) refcount_add_many(&state->arcs_size, size, buf);
ca0bf58d
PS
3866
3867 /*
3868 * If this is reached via arc_read, the link is
3869 * protected by the hash lock. If reached via
3870 * arc_buf_alloc, the header should not be accessed by
3871 * any other thread. And, if reached via arc_read_done,
3872 * the hash lock will protect it if it's found in the
3873 * hash table; otherwise no other thread should be
3874 * trying to [add|remove]_reference it.
3875 */
3876 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
b9541d6b
CW
3877 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3878 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3879 size);
34dc7c2f
BB
3880 }
3881 /*
3882 * If we are growing the cache, and we are adding anonymous
3883 * data, and we have outgrown arc_p, update arc_p
3884 */
ca0bf58d 3885 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
36da08ef
PS
3886 (refcount_count(&arc_anon->arcs_size) +
3887 refcount_count(&arc_mru->arcs_size) > arc_p))
34dc7c2f
BB
3888 arc_p = MIN(arc_c, arc_p + size);
3889 }
3890}
3891
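The wait in arc_get_data_buf() follows the usual condition-variable discipline: take the lock, re-check the predicate, and only then wait; because one wakeup is enough here, an if rather than a while is used. A hedged pthreads analogue of that shape, standing in for the kernel CV and mutex:

/* Illustrative sketch only -- pthreads stand in for the kernel CV/mutex. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waiters_cv = PTHREAD_COND_INITIALIZER;
static int overflowing = 0;	/* toggled by an eviction thread elsewhere */

static void
wait_if_overflowing(void)
{
	if (!overflowing)
		return;
	pthread_mutex_lock(&reclaim_lock);
	/* Re-check under the lock; the condition may have cleared. */
	if (overflowing)
		pthread_cond_wait(&waiters_cv, &reclaim_lock);
	pthread_mutex_unlock(&reclaim_lock);
}

int
main(void)
{
	wait_if_overflowing();	/* returns immediately: not overflowing */
	printf("proceeding with allocation\n");
	return (0);
}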
3892/*
3893 * This routine is called whenever a buffer is accessed.
3894 * NOTE: the hash lock is dropped in this function.
3895 */
3896static void
2a432414 3897arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
34dc7c2f 3898{
428870ff
BB
3899 clock_t now;
3900
34dc7c2f 3901 ASSERT(MUTEX_HELD(hash_lock));
b9541d6b 3902 ASSERT(HDR_HAS_L1HDR(hdr));
34dc7c2f 3903
b9541d6b 3904 if (hdr->b_l1hdr.b_state == arc_anon) {
34dc7c2f
BB
3905 /*
3906 * This buffer is not in the cache, and does not
3907 * appear in our "ghost" list. Add the new buffer
3908 * to the MRU state.
3909 */
3910
b9541d6b
CW
3911 ASSERT0(hdr->b_l1hdr.b_arc_access);
3912 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
3913 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3914 arc_change_state(arc_mru, hdr, hash_lock);
34dc7c2f 3915
b9541d6b 3916 } else if (hdr->b_l1hdr.b_state == arc_mru) {
428870ff
BB
3917 now = ddi_get_lbolt();
3918
34dc7c2f
BB
3919 /*
3920 * If this buffer is here because of a prefetch, then either:
3921 * - clear the flag if this is a "referencing" read
3922 * (any subsequent access will bump this into the MFU state).
3923 * or
3924 * - move the buffer to the head of the list if this is
3925 * another prefetch (to make it less likely to be evicted).
3926 */
b9541d6b
CW
3927 if (HDR_PREFETCH(hdr)) {
3928 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
ca0bf58d
PS
3929 /* link protected by hash lock */
3930 ASSERT(multilist_link_active(
b9541d6b 3931 &hdr->b_l1hdr.b_arc_node));
34dc7c2f 3932 } else {
2a432414 3933 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
b9541d6b 3934 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
34dc7c2f
BB
3935 ARCSTAT_BUMP(arcstat_mru_hits);
3936 }
b9541d6b 3937 hdr->b_l1hdr.b_arc_access = now;
34dc7c2f
BB
3938 return;
3939 }
3940
3941 /*
3942 * This buffer has been "accessed" only once so far,
3943 * but it is still in the cache. Move it to the MFU
3944 * state.
3945 */
b9541d6b
CW
3946 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
3947 ARC_MINTIME)) {
34dc7c2f
BB
3948 /*
3949 * More than 125ms have passed since we
3950 * instantiated this buffer. Move it to the
3951 * most frequently used state.
3952 */
b9541d6b 3953 hdr->b_l1hdr.b_arc_access = now;
2a432414
GW
3954 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3955 arc_change_state(arc_mfu, hdr, hash_lock);
34dc7c2f 3956 }
b9541d6b 3957 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
34dc7c2f 3958 ARCSTAT_BUMP(arcstat_mru_hits);
b9541d6b 3959 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
34dc7c2f
BB
3960 arc_state_t *new_state;
3961 /*
3962 * This buffer has been "accessed" recently, but
3963 * was evicted from the cache. Move it to the
3964 * MFU state.
3965 */
3966
b9541d6b 3967 if (HDR_PREFETCH(hdr)) {
34dc7c2f 3968 new_state = arc_mru;
b9541d6b 3969 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
2a432414
GW
3970 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3971 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
34dc7c2f
BB
3972 } else {
3973 new_state = arc_mfu;
2a432414 3974 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
34dc7c2f
BB
3975 }
3976
b9541d6b 3977 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414 3978 arc_change_state(new_state, hdr, hash_lock);
34dc7c2f 3979
b9541d6b 3980 atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
34dc7c2f 3981 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
b9541d6b 3982 } else if (hdr->b_l1hdr.b_state == arc_mfu) {
34dc7c2f
BB
3983 /*
3984 * This buffer has been accessed more than once and is
3985 * still in the cache. Keep it in the MFU state.
3986 *
3987 * NOTE: an add_reference() that occurred when we did
3988 * the arc_read() will have kicked this off the list.
3989 * If it was a prefetch, we will explicitly move it to
3990 * the head of the list now.
3991 */
b9541d6b
CW
3992 if ((HDR_PREFETCH(hdr)) != 0) {
3993 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ca0bf58d
PS
3994 /* link protected by hash_lock */
3995 ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
34dc7c2f 3996 }
b9541d6b 3997 atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
34dc7c2f 3998 ARCSTAT_BUMP(arcstat_mfu_hits);
b9541d6b
CW
3999 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4000 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
34dc7c2f
BB
4001 arc_state_t *new_state = arc_mfu;
4002 /*
4003 * This buffer has been accessed more than once but has
4004 * been evicted from the cache. Move it back to the
4005 * MFU state.
4006 */
4007
b9541d6b 4008 if (HDR_PREFETCH(hdr)) {
34dc7c2f
BB
4009 /*
4010 * This is a prefetch access...
4011 * move this block back to the MRU state.
4012 */
b9541d6b 4013 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
34dc7c2f
BB
4014 new_state = arc_mru;
4015 }
4016
b9541d6b 4017 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
4018 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4019 arc_change_state(new_state, hdr, hash_lock);
34dc7c2f 4020
b9541d6b 4021 atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
34dc7c2f 4022 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
b9541d6b 4023 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
34dc7c2f
BB
4024 /*
4025 * This buffer is on the 2nd Level ARC.
4026 */
4027
b9541d6b 4028 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
4029 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4030 arc_change_state(arc_mfu, hdr, hash_lock);
34dc7c2f 4031 } else {
b9541d6b
CW
4032 cmn_err(CE_PANIC, "invalid arc state 0x%p",
4033 hdr->b_l1hdr.b_state);
34dc7c2f
BB
4034 }
4035}
4036
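The branches of arc_access() amount to a small state machine keyed on which list the header is currently on. A compact, hedged summary follows; prefetch handling and reference counting are deliberately omitted, and the names are this sketch's own, not the real arc_state_t plumbing.

/* Illustrative sketch only -- summarizes the transitions in arc_access(). */
#include <stdio.h>

typedef enum { ANON, MRU, MRU_GHOST, MFU, MFU_GHOST, L2C_ONLY } state_t;

static state_t
next_state(state_t cur, int accessed_within_mintime)
{
	switch (cur) {
	case ANON:	return (MRU);	/* first insertion into the cache */
	case MRU:	/* promote only after ARC_MINTIME has passed */
			return (accessed_within_mintime ? MRU : MFU);
	case MRU_GHOST:	return (MFU);	/* evicted once, now re-read */
	case MFU:	return (MFU);	/* stays frequently used */
	case MFU_GHOST:	return (MFU);	/* evicted, but was frequently used */
	case L2C_ONLY:	return (MFU);	/* pulled back from the L2ARC */
	}
	return (cur);
}

int
main(void)
{
	printf("anon -> %d\n", (int)next_state(ANON, 0));	/* MRU */
	return (0);
}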
4037/* a generic arc_done_func_t which you can use */
4038/* ARGSUSED */
4039void
4040arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
4041{
428870ff
BB
4042 if (zio == NULL || zio->io_error == 0)
4043 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
13fe0198 4044 VERIFY(arc_buf_remove_ref(buf, arg));
34dc7c2f
BB
4045}
4046
4047/* a generic arc_done_func_t */
4048void
4049arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
4050{
4051 arc_buf_t **bufp = arg;
4052 if (zio && zio->io_error) {
13fe0198 4053 VERIFY(arc_buf_remove_ref(buf, arg));
34dc7c2f
BB
4054 *bufp = NULL;
4055 } else {
4056 *bufp = buf;
428870ff 4057 ASSERT(buf->b_data);
34dc7c2f
BB
4058 }
4059}
4060
4061static void
4062arc_read_done(zio_t *zio)
4063{
9b67f605 4064 arc_buf_hdr_t *hdr;
34dc7c2f
BB
4065 arc_buf_t *buf;
4066 arc_buf_t *abuf; /* buffer we're assigning to callback */
9b67f605 4067 kmutex_t *hash_lock = NULL;
34dc7c2f
BB
4068 arc_callback_t *callback_list, *acb;
4069 int freeable = FALSE;
4070
4071 buf = zio->io_private;
4072 hdr = buf->b_hdr;
4073
4074 /*
4075 	 * The hdr was inserted into the hash table and removed from lists
4076 * prior to starting I/O. We should find this header, since
4077 * it's in the hash table, and it should be legit since it's
4078 * not possible to evict it during the I/O. The only possible
4079 * reason for it not to be found is if we were freed during the
4080 * read.
4081 */
9b67f605
MA
4082 if (HDR_IN_HASH_TABLE(hdr)) {
4083 arc_buf_hdr_t *found;
4084
4085 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
4086 ASSERT3U(hdr->b_dva.dva_word[0], ==,
4087 BP_IDENTITY(zio->io_bp)->dva_word[0]);
4088 ASSERT3U(hdr->b_dva.dva_word[1], ==,
4089 BP_IDENTITY(zio->io_bp)->dva_word[1]);
4090
4091 found = buf_hash_find(hdr->b_spa, zio->io_bp,
4092 &hash_lock);
4093
4094 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
4095 hash_lock == NULL) ||
4096 (found == hdr &&
4097 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
4098 (found == hdr && HDR_L2_READING(hdr)));
4099 }
34dc7c2f 4100
2a432414 4101 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
b9541d6b 4102 if (l2arc_noprefetch && HDR_PREFETCH(hdr))
2a432414 4103 hdr->b_flags &= ~ARC_FLAG_L2CACHE;
34dc7c2f
BB
4104
4105 /* byteswap if necessary */
b9541d6b 4106 callback_list = hdr->b_l1hdr.b_acb;
34dc7c2f 4107 ASSERT(callback_list != NULL);
428870ff 4108 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
9ae529ec
CS
4109 dmu_object_byteswap_t bswap =
4110 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
b01615d5
RY
4111 if (BP_GET_LEVEL(zio->io_bp) > 0)
4112 byteswap_uint64_array(buf->b_data, hdr->b_size);
4113 else
4114 dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
b128c09f 4115 }
34dc7c2f
BB
4116
4117 arc_cksum_compute(buf, B_FALSE);
498877ba 4118 arc_buf_watch(buf);
34dc7c2f 4119
b9541d6b
CW
4120 if (hash_lock && zio->io_error == 0 &&
4121 hdr->b_l1hdr.b_state == arc_anon) {
428870ff
BB
4122 /*
4123 * Only call arc_access on anonymous buffers. This is because
4124 * if we've issued an I/O for an evicted buffer, we've already
4125 * called arc_access (to prevent any simultaneous readers from
4126 * getting confused).
4127 */
4128 arc_access(hdr, hash_lock);
4129 }
4130
34dc7c2f
BB
4131 /* create copies of the data buffer for the callers */
4132 abuf = buf;
4133 for (acb = callback_list; acb; acb = acb->acb_next) {
4134 if (acb->acb_done) {
1eb5bfa3
GW
4135 if (abuf == NULL) {
4136 ARCSTAT_BUMP(arcstat_duplicate_reads);
34dc7c2f 4137 abuf = arc_buf_clone(buf);
1eb5bfa3 4138 }
34dc7c2f
BB
4139 acb->acb_buf = abuf;
4140 abuf = NULL;
4141 }
4142 }
b9541d6b 4143 hdr->b_l1hdr.b_acb = NULL;
2a432414 4144 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
34dc7c2f 4145 ASSERT(!HDR_BUF_AVAILABLE(hdr));
428870ff
BB
4146 if (abuf == buf) {
4147 ASSERT(buf->b_efunc == NULL);
b9541d6b 4148 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2a432414 4149 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
428870ff 4150 }
34dc7c2f 4151
b9541d6b
CW
4152 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
4153 callback_list != NULL);
34dc7c2f
BB
4154
4155 if (zio->io_error != 0) {
2a432414 4156 hdr->b_flags |= ARC_FLAG_IO_ERROR;
b9541d6b 4157 if (hdr->b_l1hdr.b_state != arc_anon)
34dc7c2f
BB
4158 arc_change_state(arc_anon, hdr, hash_lock);
4159 if (HDR_IN_HASH_TABLE(hdr))
4160 buf_hash_remove(hdr);
b9541d6b 4161 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
34dc7c2f
BB
4162 }
4163
4164 /*
4165 * Broadcast before we drop the hash_lock to avoid the possibility
4166 * that the hdr (and hence the cv) might be freed before we get to
4167 * the cv_broadcast().
4168 */
b9541d6b 4169 cv_broadcast(&hdr->b_l1hdr.b_cv);
34dc7c2f 4170
b9541d6b 4171 if (hash_lock != NULL) {
34dc7c2f
BB
4172 mutex_exit(hash_lock);
4173 } else {
4174 /*
4175 * This block was freed while we waited for the read to
4176 * complete. It has been removed from the hash table and
4177 * moved to the anonymous state (so that it won't show up
4178 * in the cache).
4179 */
b9541d6b
CW
4180 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
4181 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
34dc7c2f
BB
4182 }
4183
4184 /* execute each callback and free its structure */
4185 while ((acb = callback_list) != NULL) {
4186 if (acb->acb_done)
4187 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
4188
4189 if (acb->acb_zio_dummy != NULL) {
4190 acb->acb_zio_dummy->io_error = zio->io_error;
4191 zio_nowait(acb->acb_zio_dummy);
4192 }
4193
4194 callback_list = acb->acb_next;
4195 kmem_free(acb, sizeof (arc_callback_t));
4196 }
4197
4198 if (freeable)
4199 arc_hdr_destroy(hdr);
4200}
4201
4202/*
5c839890 4203 * "Read" the block at the specified DVA (in bp) via the
34dc7c2f
BB
4204 * cache. If the block is found in the cache, invoke the provided
4205 * callback immediately and return. Note that the `zio' parameter
4206 * in the callback will be NULL in this case, since no IO was
4207 * required. If the block is not in the cache pass the read request
4208 * on to the spa with a substitute callback function, so that the
4209 * requested block will be added to the cache.
4210 *
4211 * If a read request arrives for a block that has a read in-progress,
4212 * either wait for the in-progress read to complete (and return the
4213 * results); or, if this is a read with a "done" func, add a record
4214 * to the read to invoke the "done" func when the read completes,
4215 * and return; or just return.
4216 *
4217 * arc_read_done() will invoke all the requested "done" functions
4218 * for readers of this block.
4219 */
4220int
294f6806 4221arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2a432414
GW
4222 void *private, zio_priority_t priority, int zio_flags,
4223 arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
34dc7c2f 4224{
9b67f605 4225 arc_buf_hdr_t *hdr = NULL;
d4ed6673 4226 arc_buf_t *buf = NULL;
9b67f605 4227 kmutex_t *hash_lock = NULL;
34dc7c2f 4228 zio_t *rzio;
3541dc6d 4229 uint64_t guid = spa_load_guid(spa);
1421c891 4230 int rc = 0;
34dc7c2f 4231
9b67f605
MA
4232 ASSERT(!BP_IS_EMBEDDED(bp) ||
4233 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
4234
34dc7c2f 4235top:
9b67f605
MA
4236 if (!BP_IS_EMBEDDED(bp)) {
4237 /*
4238 * Embedded BP's have no DVA and require no I/O to "read".
4239 * Create an anonymous arc buf to back it.
4240 */
4241 hdr = buf_hash_find(guid, bp, &hash_lock);
4242 }
4243
b9541d6b 4244 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
34dc7c2f 4245
2a432414 4246 *arc_flags |= ARC_FLAG_CACHED;
34dc7c2f
BB
4247
4248 if (HDR_IO_IN_PROGRESS(hdr)) {
4249
2a432414 4250 if (*arc_flags & ARC_FLAG_WAIT) {
b9541d6b 4251 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
34dc7c2f
BB
4252 mutex_exit(hash_lock);
4253 goto top;
4254 }
2a432414 4255 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
34dc7c2f
BB
4256
4257 if (done) {
4258 arc_callback_t *acb = NULL;
4259
4260 acb = kmem_zalloc(sizeof (arc_callback_t),
79c76d5b 4261 KM_SLEEP);
34dc7c2f
BB
4262 acb->acb_done = done;
4263 acb->acb_private = private;
34dc7c2f
BB
4264 if (pio != NULL)
4265 acb->acb_zio_dummy = zio_null(pio,
d164b209 4266 spa, NULL, NULL, NULL, zio_flags);
34dc7c2f
BB
4267
4268 ASSERT(acb->acb_done != NULL);
b9541d6b
CW
4269 acb->acb_next = hdr->b_l1hdr.b_acb;
4270 hdr->b_l1hdr.b_acb = acb;
34dc7c2f
BB
4271 add_reference(hdr, hash_lock, private);
4272 mutex_exit(hash_lock);
1421c891 4273 goto out;
34dc7c2f
BB
4274 }
4275 mutex_exit(hash_lock);
1421c891 4276 goto out;
34dc7c2f
BB
4277 }
4278
b9541d6b
CW
4279 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4280 hdr->b_l1hdr.b_state == arc_mfu);
34dc7c2f
BB
4281
4282 if (done) {
4283 add_reference(hdr, hash_lock, private);
4284 /*
4285 * If this block is already in use, create a new
4286 * copy of the data so that we will be guaranteed
4287 * that arc_release() will always succeed.
4288 */
b9541d6b 4289 buf = hdr->b_l1hdr.b_buf;
34dc7c2f
BB
4290 ASSERT(buf);
4291 ASSERT(buf->b_data);
4292 if (HDR_BUF_AVAILABLE(hdr)) {
4293 ASSERT(buf->b_efunc == NULL);
2a432414 4294 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
34dc7c2f
BB
4295 } else {
4296 buf = arc_buf_clone(buf);
4297 }
428870ff 4298
2a432414 4299 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
b9541d6b 4300 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
2a432414 4301 hdr->b_flags |= ARC_FLAG_PREFETCH;
34dc7c2f
BB
4302 }
4303 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4304 arc_access(hdr, hash_lock);
2a432414
GW
4305 if (*arc_flags & ARC_FLAG_L2CACHE)
4306 hdr->b_flags |= ARC_FLAG_L2CACHE;
4307 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4308 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
34dc7c2f
BB
4309 mutex_exit(hash_lock);
4310 ARCSTAT_BUMP(arcstat_hits);
b9541d6b
CW
4311 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4312 demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
34dc7c2f
BB
4313 data, metadata, hits);
4314
4315 if (done)
4316 done(NULL, buf, private);
4317 } else {
4318 uint64_t size = BP_GET_LSIZE(bp);
9b67f605 4319 arc_callback_t *acb;
b128c09f 4320 vdev_t *vd = NULL;
a117a6d6 4321 uint64_t addr = 0;
d164b209 4322 boolean_t devw = B_FALSE;
0ed212dc 4323 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
b9541d6b 4324 int32_t b_asize = 0;
34dc7c2f 4325
5f6d0b6f
BB
4326 /*
4327 * Gracefully handle a damaged logical block size as a
4328 * checksum error by passing a dummy zio to the done callback.
4329 */
f1512ee6 4330 if (size > spa_maxblocksize(spa)) {
5f6d0b6f
BB
4331 if (done) {
4332 rzio = zio_null(pio, spa, NULL,
4333 NULL, NULL, zio_flags);
4334 rzio->io_error = ECKSUM;
4335 done(rzio, buf, private);
4336 zio_nowait(rzio);
4337 }
4338 rc = ECKSUM;
4339 goto out;
4340 }
4341
34dc7c2f
BB
4342 if (hdr == NULL) {
4343 /* this block is not in the cache */
9b67f605 4344 arc_buf_hdr_t *exists = NULL;
34dc7c2f
BB
4345 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4346 buf = arc_buf_alloc(spa, size, private, type);
4347 hdr = buf->b_hdr;
9b67f605
MA
4348 if (!BP_IS_EMBEDDED(bp)) {
4349 hdr->b_dva = *BP_IDENTITY(bp);
4350 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
9b67f605
MA
4351 exists = buf_hash_insert(hdr, &hash_lock);
4352 }
4353 if (exists != NULL) {
34dc7c2f
BB
4354 /* somebody beat us to the hash insert */
4355 mutex_exit(hash_lock);
428870ff 4356 buf_discard_identity(hdr);
34dc7c2f
BB
4357 (void) arc_buf_remove_ref(buf, private);
4358 goto top; /* restart the IO request */
4359 }
2a432414 4360
34dc7c2f 4361 /* if this is a prefetch, we don't have a reference */
2a432414 4362 if (*arc_flags & ARC_FLAG_PREFETCH) {
34dc7c2f
BB
4363 (void) remove_reference(hdr, hash_lock,
4364 private);
2a432414 4365 hdr->b_flags |= ARC_FLAG_PREFETCH;
34dc7c2f 4366 }
2a432414
GW
4367 if (*arc_flags & ARC_FLAG_L2CACHE)
4368 hdr->b_flags |= ARC_FLAG_L2CACHE;
4369 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4370 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
34dc7c2f 4371 if (BP_GET_LEVEL(bp) > 0)
2a432414 4372 hdr->b_flags |= ARC_FLAG_INDIRECT;
34dc7c2f 4373 } else {
b9541d6b
CW
4374 /*
4375 * This block is in the ghost cache. If it was L2-only
4376 * (and thus didn't have an L1 hdr), we realloc the
4377 * header to add an L1 hdr.
4378 */
4379 if (!HDR_HAS_L1HDR(hdr)) {
4380 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4381 hdr_full_cache);
4382 }
4383
4384 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
34dc7c2f 4385 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
b9541d6b 4386 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ca0bf58d 4387 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
34dc7c2f
BB
4388
4389 /* if this is a prefetch, we don't have a reference */
2a432414
GW
4390 if (*arc_flags & ARC_FLAG_PREFETCH)
4391 hdr->b_flags |= ARC_FLAG_PREFETCH;
34dc7c2f
BB
4392 else
4393 add_reference(hdr, hash_lock, private);
2a432414
GW
4394 if (*arc_flags & ARC_FLAG_L2CACHE)
4395 hdr->b_flags |= ARC_FLAG_L2CACHE;
4396 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4397 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
34dc7c2f
BB
4398 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4399 buf->b_hdr = hdr;
4400 buf->b_data = NULL;
4401 buf->b_efunc = NULL;
4402 buf->b_private = NULL;
4403 buf->b_next = NULL;
b9541d6b
CW
4404 hdr->b_l1hdr.b_buf = buf;
4405 ASSERT0(hdr->b_l1hdr.b_datacnt);
4406 hdr->b_l1hdr.b_datacnt = 1;
428870ff
BB
4407 arc_get_data_buf(buf);
4408 arc_access(hdr, hash_lock);
34dc7c2f
BB
4409 }
4410
b9541d6b 4411 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
428870ff 4412
79c76d5b 4413 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
34dc7c2f
BB
4414 acb->acb_done = done;
4415 acb->acb_private = private;
34dc7c2f 4416
b9541d6b
CW
4417 ASSERT(hdr->b_l1hdr.b_acb == NULL);
4418 hdr->b_l1hdr.b_acb = acb;
2a432414 4419 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
34dc7c2f 4420
b9541d6b
CW
4421 if (HDR_HAS_L2HDR(hdr) &&
4422 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4423 devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4424 addr = hdr->b_l2hdr.b_daddr;
4425 b_compress = HDR_GET_COMPRESS(hdr);
4426 b_asize = hdr->b_l2hdr.b_asize;
b128c09f
BB
4427 /*
4428 * Lock out device removal.
4429 */
4430 if (vdev_is_dead(vd) ||
4431 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4432 vd = NULL;
4433 }
4434
9b67f605
MA
4435 if (hash_lock != NULL)
4436 mutex_exit(hash_lock);
b128c09f 4437
e49f1e20
WA
4438 /*
4439 * At this point, we have a level 1 cache miss. Try again in
4440 * L2ARC if possible.
4441 */
34dc7c2f 4442 ASSERT3U(hdr->b_size, ==, size);
428870ff 4443 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
5dbd68a3 4444 uint64_t, size, zbookmark_phys_t *, zb);
34dc7c2f 4445 ARCSTAT_BUMP(arcstat_misses);
b9541d6b
CW
4446 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4447 demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
34dc7c2f
BB
4448 data, metadata, misses);
4449
d164b209 4450 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
34dc7c2f
BB
4451 /*
4452 * Read from the L2ARC if the following are true:
b128c09f
BB
4453 * 1. The L2ARC vdev was previously cached.
4454 * 2. This buffer still has L2ARC metadata.
4455 * 3. This buffer isn't currently writing to the L2ARC.
4456 * 4. The L2ARC entry wasn't evicted, which may
4457 * also have invalidated the vdev.
d164b209 4458 * 5. This isn't prefetch and l2arc_noprefetch is set.
34dc7c2f 4459 */
b9541d6b 4460 if (HDR_HAS_L2HDR(hdr) &&
d164b209
BB
4461 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4462 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
34dc7c2f
BB
4463 l2arc_read_callback_t *cb;
4464
4465 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4466 ARCSTAT_BUMP(arcstat_l2_hits);
b9541d6b 4467 atomic_inc_32(&hdr->b_l2hdr.b_hits);
34dc7c2f 4468
34dc7c2f 4469 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
79c76d5b 4470 KM_SLEEP);
34dc7c2f
BB
4471 cb->l2rcb_buf = buf;
4472 cb->l2rcb_spa = spa;
4473 cb->l2rcb_bp = *bp;
4474 cb->l2rcb_zb = *zb;
b128c09f 4475 cb->l2rcb_flags = zio_flags;
0ed212dc 4476 cb->l2rcb_compress = b_compress;
34dc7c2f 4477
a117a6d6
GW
4478 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4479 addr + size < vd->vdev_psize -
4480 VDEV_LABEL_END_SIZE);
4481
34dc7c2f 4482 /*
b128c09f
BB
4483 * l2arc read. The SCL_L2ARC lock will be
4484 * released by l2arc_read_done().
3a17a7a9
SK
4485 * Issue a null zio if the underlying buffer
4486 * was squashed to zero size by compression.
34dc7c2f 4487 */
0ed212dc 4488 if (b_compress == ZIO_COMPRESS_EMPTY) {
3a17a7a9
SK
4489 rzio = zio_null(pio, spa, vd,
4490 l2arc_read_done, cb,
4491 zio_flags | ZIO_FLAG_DONT_CACHE |
4492 ZIO_FLAG_CANFAIL |
4493 ZIO_FLAG_DONT_PROPAGATE |
4494 ZIO_FLAG_DONT_RETRY);
4495 } else {
4496 rzio = zio_read_phys(pio, vd, addr,
0ed212dc
BP
4497 b_asize, buf->b_data,
4498 ZIO_CHECKSUM_OFF,
3a17a7a9
SK
4499 l2arc_read_done, cb, priority,
4500 zio_flags | ZIO_FLAG_DONT_CACHE |
4501 ZIO_FLAG_CANFAIL |
4502 ZIO_FLAG_DONT_PROPAGATE |
4503 ZIO_FLAG_DONT_RETRY, B_FALSE);
4504 }
34dc7c2f
BB
4505 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4506 zio_t *, rzio);
0ed212dc 4507 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
34dc7c2f 4508
2a432414 4509 if (*arc_flags & ARC_FLAG_NOWAIT) {
b128c09f 4510 zio_nowait(rzio);
1421c891 4511 goto out;
b128c09f 4512 }
34dc7c2f 4513
2a432414 4514 ASSERT(*arc_flags & ARC_FLAG_WAIT);
b128c09f 4515 if (zio_wait(rzio) == 0)
1421c891 4516 goto out;
b128c09f
BB
4517
4518 /* l2arc read error; goto zio_read() */
34dc7c2f
BB
4519 } else {
4520 DTRACE_PROBE1(l2arc__miss,
4521 arc_buf_hdr_t *, hdr);
4522 ARCSTAT_BUMP(arcstat_l2_misses);
4523 if (HDR_L2_WRITING(hdr))
4524 ARCSTAT_BUMP(arcstat_l2_rw_clash);
b128c09f 4525 spa_config_exit(spa, SCL_L2ARC, vd);
34dc7c2f 4526 }
d164b209
BB
4527 } else {
4528 if (vd != NULL)
4529 spa_config_exit(spa, SCL_L2ARC, vd);
4530 if (l2arc_ndev != 0) {
4531 DTRACE_PROBE1(l2arc__miss,
4532 arc_buf_hdr_t *, hdr);
4533 ARCSTAT_BUMP(arcstat_l2_misses);
4534 }
34dc7c2f 4535 }
34dc7c2f
BB
4536
4537 rzio = zio_read(pio, spa, bp, buf->b_data, size,
b128c09f 4538 arc_read_done, buf, priority, zio_flags, zb);
34dc7c2f 4539
2a432414 4540 if (*arc_flags & ARC_FLAG_WAIT) {
1421c891
PS
4541 rc = zio_wait(rzio);
4542 goto out;
4543 }
34dc7c2f 4544
2a432414 4545 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
34dc7c2f
BB
4546 zio_nowait(rzio);
4547 }
1421c891
PS
4548
4549out:
4550 spa_read_history_add(spa, zb, *arc_flags);
4551 return (rc);
34dc7c2f
BB
4552}
4553
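At a high level, arc_read() above resolves to three outcomes: a cache hit (run the done callback immediately), a hit on a block whose read is already in flight (wait, or queue the callback for when it lands), and a miss (issue the I/O, trying the L2ARC before disk). A hedged sketch of that decision flow; the helper names are hypothetical and are not the real ZFS interfaces:

/* Illustrative sketch only -- hypothetical helpers, not the ZFS API. */
#include <stddef.h>
#include <stdio.h>

struct hdr { int cached; int io_in_progress; };

/* Hypothetical stand-ins for buf_hash_find(), zio_read(), etc. */
static struct hdr *cache_lookup(const char *key) { (void)key; return (NULL); }
static void queue_callback(struct hdr *h) { (void)h; }
static void issue_read(const char *key, int try_l2arc)
{
	printf("miss: reading %s (l2arc first: %d)\n", key, try_l2arc);
}

static void
read_through(const char *key)
{
	struct hdr *h = cache_lookup(key);

	if (h != NULL && h->cached && !h->io_in_progress) {
		printf("hit: run done callback now\n");
	} else if (h != NULL && h->io_in_progress) {
		queue_callback(h);	/* run when the in-flight read lands */
	} else {
		issue_read(key, 1);	/* miss: L2ARC if present, else disk */
	}
}

int
main(void)
{
	read_through("dva:0:12345");
	return (0);
}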
ab26409d
BB
4554arc_prune_t *
4555arc_add_prune_callback(arc_prune_func_t *func, void *private)
4556{
4557 arc_prune_t *p;
4558
d1d7e268 4559 p = kmem_alloc(sizeof (*p), KM_SLEEP);
ab26409d
BB
4560 p->p_pfunc = func;
4561 p->p_private = private;
4562 list_link_init(&p->p_node);
4563 refcount_create(&p->p_refcnt);
4564
4565 mutex_enter(&arc_prune_mtx);
4566 refcount_add(&p->p_refcnt, &arc_prune_list);
4567 list_insert_head(&arc_prune_list, p);
4568 mutex_exit(&arc_prune_mtx);
4569
4570 return (p);
4571}
4572
4573void
4574arc_remove_prune_callback(arc_prune_t *p)
4575{
4576 mutex_enter(&arc_prune_mtx);
4577 list_remove(&arc_prune_list, p);
4578 if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
4579 refcount_destroy(&p->p_refcnt);
4580 kmem_free(p, sizeof (*p));
4581 }
4582 mutex_exit(&arc_prune_mtx);
4583}
4584
34dc7c2f
BB
4585void
4586arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4587{
4588 ASSERT(buf->b_hdr != NULL);
b9541d6b
CW
4589 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4590 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4591 func == NULL);
428870ff
BB
4592 ASSERT(buf->b_efunc == NULL);
4593 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4594
34dc7c2f
BB
4595 buf->b_efunc = func;
4596 buf->b_private = private;
4597}
4598
df4474f9
MA
4599/*
4600 * Notify the arc that a block was freed, and thus will never be used again.
4601 */
4602void
4603arc_freed(spa_t *spa, const blkptr_t *bp)
4604{
4605 arc_buf_hdr_t *hdr;
4606 kmutex_t *hash_lock;
4607 uint64_t guid = spa_load_guid(spa);
4608
9b67f605
MA
4609 ASSERT(!BP_IS_EMBEDDED(bp));
4610
4611 hdr = buf_hash_find(guid, bp, &hash_lock);
df4474f9
MA
4612 if (hdr == NULL)
4613 return;
4614 if (HDR_BUF_AVAILABLE(hdr)) {
b9541d6b 4615 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
df4474f9 4616 add_reference(hdr, hash_lock, FTAG);
2a432414 4617 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
df4474f9
MA
4618 mutex_exit(hash_lock);
4619
4620 arc_release(buf, FTAG);
4621 (void) arc_buf_remove_ref(buf, FTAG);
4622 } else {
4623 mutex_exit(hash_lock);
4624 }
4625
4626}
4627
34dc7c2f 4628/*
bd089c54
MA
4629 * Clear the user eviction callback set by arc_set_callback(), first calling
4630  * it if it exists. Because the presence of a callback keeps an arc_buf cached,
4631 * clearing the callback may result in the arc_buf being destroyed. However,
4632 * it will not result in the *last* arc_buf being destroyed, hence the data
4633 * will remain cached in the ARC. We make a copy of the arc buffer here so
4634 * that we can process the callback without holding any locks.
4635 *
4636 * It's possible that the callback is already in the process of being cleared
4637 * by another thread. In this case we can not clear the callback.
4638 *
4639 * Returns B_TRUE if the callback was successfully called and cleared.
34dc7c2f 4640 */
bd089c54
MA
4641boolean_t
4642arc_clear_callback(arc_buf_t *buf)
34dc7c2f
BB
4643{
4644 arc_buf_hdr_t *hdr;
4645 kmutex_t *hash_lock;
bd089c54
MA
4646 arc_evict_func_t *efunc = buf->b_efunc;
4647 void *private = buf->b_private;
34dc7c2f 4648
428870ff 4649 mutex_enter(&buf->b_evict_lock);
34dc7c2f
BB
4650 hdr = buf->b_hdr;
4651 if (hdr == NULL) {
4652 /*
4653 * We are in arc_do_user_evicts().
4654 */
4655 ASSERT(buf->b_data == NULL);
428870ff 4656 mutex_exit(&buf->b_evict_lock);
bd089c54 4657 return (B_FALSE);
b128c09f 4658 } else if (buf->b_data == NULL) {
34dc7c2f 4659 /*
b128c09f
BB
4660 * We are on the eviction list; process this buffer now
4661 * but let arc_do_user_evicts() do the reaping.
34dc7c2f 4662 */
b128c09f 4663 buf->b_efunc = NULL;
428870ff 4664 mutex_exit(&buf->b_evict_lock);
bd089c54
MA
4665 VERIFY0(efunc(private));
4666 return (B_TRUE);
34dc7c2f 4667 }
b128c09f
BB
4668 hash_lock = HDR_LOCK(hdr);
4669 mutex_enter(hash_lock);
428870ff
BB
4670 hdr = buf->b_hdr;
4671 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
34dc7c2f 4672
b9541d6b
CW
4673 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4674 hdr->b_l1hdr.b_datacnt);
4675 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4676 hdr->b_l1hdr.b_state == arc_mfu);
34dc7c2f 4677
bd089c54
MA
4678 buf->b_efunc = NULL;
4679 buf->b_private = NULL;
34dc7c2f 4680
b9541d6b 4681 if (hdr->b_l1hdr.b_datacnt > 1) {
bd089c54 4682 mutex_exit(&buf->b_evict_lock);
ca0bf58d 4683 arc_buf_destroy(buf, TRUE);
bd089c54 4684 } else {
b9541d6b 4685 ASSERT(buf == hdr->b_l1hdr.b_buf);
2a432414 4686 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
bd089c54 4687 mutex_exit(&buf->b_evict_lock);
34dc7c2f 4688 }
34dc7c2f 4689
bd089c54
MA
4690 mutex_exit(hash_lock);
4691 VERIFY0(efunc(private));
4692 return (B_TRUE);
34dc7c2f
BB
4693}
4694
4695/*
e49f1e20
WA
4696 * Release this buffer from the cache, making it an anonymous buffer. This
4697 * must be done after a read and prior to modifying the buffer contents.
34dc7c2f 4698 * If the buffer has more than one reference, we must make
b128c09f 4699 * a new hdr for the buffer.
34dc7c2f
BB
4700 */
4701void
4702arc_release(arc_buf_t *buf, void *tag)
4703{
b9541d6b
CW
4704 kmutex_t *hash_lock;
4705 arc_state_t *state;
4706 arc_buf_hdr_t *hdr = buf->b_hdr;
34dc7c2f 4707
428870ff 4708 /*
ca0bf58d 4709 	 * It would be nice to assert that if it's DMU metadata (level >
428870ff
BB
4710 * 0 || it's the dnode file), then it must be syncing context.
4711 * But we don't know that information at this level.
4712 */
4713
4714 mutex_enter(&buf->b_evict_lock);
b128c09f 4715
ca0bf58d
PS
4716 ASSERT(HDR_HAS_L1HDR(hdr));
4717
b9541d6b
CW
4718 /*
4719 * We don't grab the hash lock prior to this check, because if
4720 * the buffer's header is in the arc_anon state, it won't be
4721 * linked into the hash table.
4722 */
4723 if (hdr->b_l1hdr.b_state == arc_anon) {
4724 mutex_exit(&buf->b_evict_lock);
4725 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4726 ASSERT(!HDR_IN_HASH_TABLE(hdr));
4727 ASSERT(!HDR_HAS_L2HDR(hdr));
4728 ASSERT(BUF_EMPTY(hdr));
34dc7c2f 4729
b9541d6b
CW
4730 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4731 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4732 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4733
4734 ASSERT3P(buf->b_efunc, ==, NULL);
4735 ASSERT3P(buf->b_private, ==, NULL);
4736
4737 hdr->b_l1hdr.b_arc_access = 0;
4738 arc_buf_thaw(buf);
4739
4740 return;
34dc7c2f
BB
4741 }
4742
b9541d6b
CW
4743 hash_lock = HDR_LOCK(hdr);
4744 mutex_enter(hash_lock);
4745
4746 /*
4747 * This assignment is only valid as long as the hash_lock is
4748 * held; we must be careful not to reference state or the
4749 * b_state field after dropping the lock.
4750 */
4751 state = hdr->b_l1hdr.b_state;
4752 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4753 ASSERT3P(state, !=, arc_anon);
4754
4755 /* this buffer is not on any list */
4756 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4757
4758 if (HDR_HAS_L2HDR(hdr)) {
b9541d6b 4759 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
ca0bf58d
PS
4760
4761 /*
d962d5da
PS
4762 * We have to recheck this conditional now that
4763 * we're holding the l2ad_mtx to prevent a race with
4764 * another thread which might be concurrently calling
4765 * l2arc_evict(). In that case, l2arc_evict() might have
4766 * destroyed the header's L2 portion as we were waiting
4767 * to acquire the l2ad_mtx.
ca0bf58d 4768 */
d962d5da
PS
4769 if (HDR_HAS_L2HDR(hdr))
4770 arc_hdr_l2hdr_destroy(hdr);
ca0bf58d 4771
b9541d6b 4772 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
b128c09f
BB
4773 }
4774
34dc7c2f
BB
4775 /*
4776 * Do we have more than one buf?
4777 */
b9541d6b 4778 if (hdr->b_l1hdr.b_datacnt > 1) {
34dc7c2f
BB
4779 arc_buf_hdr_t *nhdr;
4780 arc_buf_t **bufp;
4781 uint64_t blksz = hdr->b_size;
d164b209 4782 uint64_t spa = hdr->b_spa;
b9541d6b 4783 arc_buf_contents_t type = arc_buf_type(hdr);
34dc7c2f
BB
4784 uint32_t flags = hdr->b_flags;
4785
b9541d6b 4786 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
34dc7c2f 4787 /*
428870ff
BB
4788 * Pull the data off of this hdr and attach it to
4789 * a new anonymous hdr.
34dc7c2f
BB
4790 */
4791 (void) remove_reference(hdr, hash_lock, tag);
b9541d6b 4792 bufp = &hdr->b_l1hdr.b_buf;
34dc7c2f
BB
4793 while (*bufp != buf)
4794 bufp = &(*bufp)->b_next;
428870ff 4795 *bufp = buf->b_next;
34dc7c2f
BB
4796 buf->b_next = NULL;
4797
b9541d6b 4798 ASSERT3P(state, !=, arc_l2c_only);
36da08ef
PS
4799
4800 (void) refcount_remove_many(
4801 &state->arcs_size, hdr->b_size, buf);
4802
b9541d6b
CW
4803 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4804 uint64_t *size;
4805
4806 ASSERT3P(state, !=, arc_l2c_only);
4807 size = &state->arcs_lsize[type];
34dc7c2f
BB
4808 ASSERT3U(*size, >=, hdr->b_size);
4809 atomic_add_64(size, -hdr->b_size);
4810 }
1eb5bfa3
GW
4811
4812 /*
4813 * We're releasing a duplicate user data buffer, so update
4814 * our statistics accordingly.
4815 */
b9541d6b 4816 if (HDR_ISTYPE_DATA(hdr)) {
1eb5bfa3
GW
4817 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4818 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4819 -hdr->b_size);
4820 }
b9541d6b 4821 hdr->b_l1hdr.b_datacnt -= 1;
34dc7c2f 4822 arc_cksum_verify(buf);
498877ba 4823 arc_buf_unwatch(buf);
34dc7c2f
BB
4824
4825 mutex_exit(hash_lock);
4826
b9541d6b 4827 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
34dc7c2f
BB
4828 nhdr->b_size = blksz;
4829 nhdr->b_spa = spa;
b9541d6b
CW
4830
4831 nhdr->b_l1hdr.b_mru_hits = 0;
4832 nhdr->b_l1hdr.b_mru_ghost_hits = 0;
4833 nhdr->b_l1hdr.b_mfu_hits = 0;
4834 nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
4835 nhdr->b_l1hdr.b_l2_hits = 0;
2a432414 4836 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
b9541d6b
CW
4837 nhdr->b_flags |= arc_bufc_to_flags(type);
4838 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4839
4840 nhdr->b_l1hdr.b_buf = buf;
4841 nhdr->b_l1hdr.b_datacnt = 1;
4842 nhdr->b_l1hdr.b_state = arc_anon;
4843 nhdr->b_l1hdr.b_arc_access = 0;
ca0bf58d 4844 nhdr->b_l1hdr.b_tmp_cdata = NULL;
34dc7c2f 4845 nhdr->b_freeze_cksum = NULL;
b9541d6b
CW
4846
4847 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
34dc7c2f 4848 buf->b_hdr = nhdr;
428870ff 4849 mutex_exit(&buf->b_evict_lock);
36da08ef 4850 (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
34dc7c2f 4851 } else {
428870ff 4852 mutex_exit(&buf->b_evict_lock);
b9541d6b 4853 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
ca0bf58d
PS
4854 /* protected by hash lock, or hdr is on arc_anon */
4855 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
34dc7c2f 4856 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
b9541d6b
CW
4857 hdr->b_l1hdr.b_mru_hits = 0;
4858 hdr->b_l1hdr.b_mru_ghost_hits = 0;
4859 hdr->b_l1hdr.b_mfu_hits = 0;
4860 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
4861 hdr->b_l1hdr.b_l2_hits = 0;
4862 arc_change_state(arc_anon, hdr, hash_lock);
4863 hdr->b_l1hdr.b_arc_access = 0;
4864 mutex_exit(hash_lock);
34dc7c2f 4865
428870ff 4866 buf_discard_identity(hdr);
34dc7c2f
BB
4867 arc_buf_thaw(buf);
4868 }
4869 buf->b_efunc = NULL;
4870 buf->b_private = NULL;
34dc7c2f
BB
4871}
4872
4873int
4874arc_released(arc_buf_t *buf)
4875{
b128c09f
BB
4876 int released;
4877
428870ff 4878 mutex_enter(&buf->b_evict_lock);
b9541d6b
CW
4879 released = (buf->b_data != NULL &&
4880 buf->b_hdr->b_l1hdr.b_state == arc_anon);
428870ff 4881 mutex_exit(&buf->b_evict_lock);
b128c09f 4882 return (released);
34dc7c2f
BB
4883}
4884
34dc7c2f
BB
4885#ifdef ZFS_DEBUG
4886int
4887arc_referenced(arc_buf_t *buf)
4888{
b128c09f
BB
4889 int referenced;
4890
428870ff 4891 mutex_enter(&buf->b_evict_lock);
b9541d6b 4892 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
428870ff 4893 mutex_exit(&buf->b_evict_lock);
b128c09f 4894 return (referenced);
34dc7c2f
BB
4895}
4896#endif
4897
4898static void
4899arc_write_ready(zio_t *zio)
4900{
4901 arc_write_callback_t *callback = zio->io_private;
4902 arc_buf_t *buf = callback->awcb_buf;
4903 arc_buf_hdr_t *hdr = buf->b_hdr;
4904
b9541d6b
CW
4905 ASSERT(HDR_HAS_L1HDR(hdr));
4906 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4907 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
b128c09f
BB
4908 callback->awcb_ready(zio, buf, callback->awcb_private);
4909
34dc7c2f
BB
4910 /*
4911 * If the IO is already in progress, then this is a re-write
b128c09f
BB
4912 * attempt, so we need to thaw and re-compute the cksum.
4913 * It is the responsibility of the callback to handle the
4914 * accounting for any re-write attempt.
34dc7c2f
BB
4915 */
4916 if (HDR_IO_IN_PROGRESS(hdr)) {
b9541d6b 4917 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
4918 if (hdr->b_freeze_cksum != NULL) {
4919 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4920 hdr->b_freeze_cksum = NULL;
4921 }
b9541d6b 4922 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
4923 }
4924 arc_cksum_compute(buf, B_FALSE);
2a432414 4925 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
34dc7c2f
BB
4926}
4927
e8b96c60
MA
4928/*
4929 * The SPA calls this callback for each physical write that happens on behalf
4930 * of a logical write. See the comment in dbuf_write_physdone() for details.
4931 */
4932static void
4933arc_write_physdone(zio_t *zio)
4934{
4935 arc_write_callback_t *cb = zio->io_private;
4936 if (cb->awcb_physdone != NULL)
4937 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4938}
4939
34dc7c2f
BB
4940static void
4941arc_write_done(zio_t *zio)
4942{
4943 arc_write_callback_t *callback = zio->io_private;
4944 arc_buf_t *buf = callback->awcb_buf;
4945 arc_buf_hdr_t *hdr = buf->b_hdr;
4946
b9541d6b 4947 ASSERT(hdr->b_l1hdr.b_acb == NULL);
428870ff
BB
4948
4949 if (zio->io_error == 0) {
9b67f605 4950 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
b0bc7a84
MG
4951 buf_discard_identity(hdr);
4952 } else {
4953 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4954 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
b0bc7a84 4955 }
428870ff
BB
4956 } else {
4957 ASSERT(BUF_EMPTY(hdr));
4958 }
34dc7c2f 4959
34dc7c2f 4960 /*
9b67f605
MA
4961 * If the block to be written was all-zero or compressed enough to be
4962 * embedded in the BP, no write was performed so there will be no
4963 * dva/birth/checksum. The buffer must therefore remain anonymous
4964 * (and uncached).
34dc7c2f
BB
4965 */
4966 if (!BUF_EMPTY(hdr)) {
4967 arc_buf_hdr_t *exists;
4968 kmutex_t *hash_lock;
4969
428870ff
BB
4970 ASSERT(zio->io_error == 0);
4971
34dc7c2f
BB
4972 arc_cksum_verify(buf);
4973
4974 exists = buf_hash_insert(hdr, &hash_lock);
b9541d6b 4975 if (exists != NULL) {
34dc7c2f
BB
4976 /*
4977 * This can only happen if we overwrite for
4978 * sync-to-convergence, because we remove
4979 * buffers from the hash table when we arc_free().
4980 */
428870ff
BB
4981 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
4982 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4983 panic("bad overwrite, hdr=%p exists=%p",
4984 (void *)hdr, (void *)exists);
b9541d6b
CW
4985 ASSERT(refcount_is_zero(
4986 &exists->b_l1hdr.b_refcnt));
428870ff
BB
4987 arc_change_state(arc_anon, exists, hash_lock);
4988 mutex_exit(hash_lock);
4989 arc_hdr_destroy(exists);
4990 exists = buf_hash_insert(hdr, &hash_lock);
4991 ASSERT3P(exists, ==, NULL);
03c6040b
GW
4992 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
4993 /* nopwrite */
4994 ASSERT(zio->io_prop.zp_nopwrite);
4995 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4996 panic("bad nopwrite, hdr=%p exists=%p",
4997 (void *)hdr, (void *)exists);
428870ff
BB
4998 } else {
4999 /* Dedup */
b9541d6b
CW
5000 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
5001 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
428870ff
BB
5002 ASSERT(BP_GET_DEDUP(zio->io_bp));
5003 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
5004 }
34dc7c2f 5005 }
2a432414 5006 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
b128c09f 5007 /* if it's not anon, we are doing a scrub */
b9541d6b 5008 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
b128c09f 5009 arc_access(hdr, hash_lock);
34dc7c2f 5010 mutex_exit(hash_lock);
34dc7c2f 5011 } else {
2a432414 5012 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
34dc7c2f
BB
5013 }
5014
b9541d6b 5015 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
428870ff 5016 callback->awcb_done(zio, buf, callback->awcb_private);
34dc7c2f
BB
5017
5018 kmem_free(callback, sizeof (arc_write_callback_t));
5019}
5020
5021zio_t *
428870ff 5022arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3a17a7a9 5023 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
e8b96c60
MA
5024 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
5025 arc_done_func_t *done, void *private, zio_priority_t priority,
5dbd68a3 5026 int zio_flags, const zbookmark_phys_t *zb)
34dc7c2f
BB
5027{
5028 arc_buf_hdr_t *hdr = buf->b_hdr;
5029 arc_write_callback_t *callback;
b128c09f 5030 zio_t *zio;
34dc7c2f 5031
b128c09f 5032 ASSERT(ready != NULL);
428870ff 5033 ASSERT(done != NULL);
34dc7c2f 5034 ASSERT(!HDR_IO_ERROR(hdr));
b9541d6b
CW
5035 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5036 ASSERT(hdr->b_l1hdr.b_acb == NULL);
5037 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
b128c09f 5038 if (l2arc)
2a432414 5039 hdr->b_flags |= ARC_FLAG_L2CACHE;
3a17a7a9 5040 if (l2arc_compress)
2a432414 5041 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
79c76d5b 5042 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
34dc7c2f 5043 callback->awcb_ready = ready;
e8b96c60 5044 callback->awcb_physdone = physdone;
34dc7c2f
BB
5045 callback->awcb_done = done;
5046 callback->awcb_private = private;
5047 callback->awcb_buf = buf;
b128c09f 5048
428870ff 5049 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
e8b96c60
MA
5050 arc_write_ready, arc_write_physdone, arc_write_done, callback,
5051 priority, zio_flags, zb);
34dc7c2f
BB
5052
5053 return (zio);
5054}
5055
34dc7c2f 5056static int
e8b96c60 5057arc_memory_throttle(uint64_t reserve, uint64_t txg)
34dc7c2f
BB
5058{
5059#ifdef _KERNEL
0c5493d4
BB
5060 if (zfs_arc_memory_throttle_disable)
5061 return (0);
5062
ca67b33a
MA
5063 if (freemem > physmem * arc_lotsfree_percent / 100)
5064 return (0);
5065
5066 if (arc_reclaim_needed()) {
5067 /* memory is low, delay before restarting */
34dc7c2f 5068 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
570827e1 5069 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
2e528b49 5070 return (SET_ERROR(EAGAIN));
34dc7c2f 5071 }
34dc7c2f
BB
5072#endif
5073 return (0);
5074}
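/*
 * Worked example for the checks above (illustrative, assuming
 * arc_lotsfree_percent is at its default of 10 and the throttle has
 * not been disabled via zfs_arc_memory_throttle_disable): on a 16 GB
 * machine the first test succeeds whenever more than ~1.6 GB is free,
 * so the throttle only engages once free memory drops below that mark
 * and arc_reclaim_needed() also reports pressure, in which case the
 * caller sees EAGAIN and retries later.
 */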
5075
5076void
5077arc_tempreserve_clear(uint64_t reserve)
5078{
5079 atomic_add_64(&arc_tempreserve, -reserve);
5080 ASSERT((int64_t)arc_tempreserve >= 0);
5081}
5082
5083int
5084arc_tempreserve_space(uint64_t reserve, uint64_t txg)
5085{
5086 int error;
9babb374 5087 uint64_t anon_size;
34dc7c2f 5088
34dc7c2f
BB
5089 if (reserve > arc_c/4 && !arc_no_grow)
5090 arc_c = MIN(arc_c_max, reserve * 4);
12f9a6a3
BB
5091
5092 /*
5093 * Throttle when the calculated memory footprint for the TXG
5094 * exceeds the target ARC size.
5095 */
570827e1
BB
5096 if (reserve > arc_c) {
5097 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
12f9a6a3 5098 return (SET_ERROR(ERESTART));
570827e1 5099 }
34dc7c2f 5100
9babb374
BB
5101 /*
5102 * Don't count loaned bufs as in flight dirty data to prevent long
5103 * network delays from blocking transactions that are ready to be
5104 * assigned to a txg.
5105 */
36da08ef
PS
5106 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
5107 arc_loaned_bytes), 0);
9babb374 5108
34dc7c2f
BB
5109 /*
5110 * Writes will, almost always, require additional memory allocations
d3cc8b15 5111 * in order to compress/encrypt/etc the data. We therefore need to
34dc7c2f
BB
5112 * make sure that there is sufficient available memory for this.
5113 */
e8b96c60
MA
5114 error = arc_memory_throttle(reserve, txg);
5115 if (error != 0)
34dc7c2f
BB
5116 return (error);
5117
5118 /*
5119 * Throttle writes when the amount of dirty data in the cache
5120 * gets too large. We try to keep the cache less than half full
5121 * of dirty blocks so that our sync times don't grow too large.
5122 * Note: if two requests come in concurrently, we might let them
5123 * both succeed, when one of them should fail. Not a huge deal.
5124 */
9babb374
BB
5125
5126 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
5127 anon_size > arc_c / 4) {
34dc7c2f
BB
5128 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
5129 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
5130 arc_tempreserve>>10,
5131 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
5132 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
5133 reserve>>10, arc_c>>10);
570827e1 5134 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
2e528b49 5135 return (SET_ERROR(ERESTART));
34dc7c2f
BB
5136 }
5137 atomic_add_64(&arc_tempreserve, reserve);
5138 return (0);
5139}
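/*
 * Worked example for the dirty-data throttle above (illustrative
 * numbers only): with arc_c at 4 GB, a reservation is rejected with
 * ERESTART once anonymous (dirty) data exceeds arc_c / 4 == 1 GB
 * *and* reserve + arc_tempreserve + anon_size exceeds
 * arc_c / 2 == 2 GB, so the caller can back off and retry once some
 * of the dirty data has been synced out.
 */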
5140
13be560d
BB
5141static void
5142arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
5143 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
5144{
36da08ef 5145 size->value.ui64 = refcount_count(&state->arcs_size);
13be560d
BB
5146 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
5147 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
5148}
5149
5150static int
5151arc_kstat_update(kstat_t *ksp, int rw)
5152{
5153 arc_stats_t *as = ksp->ks_data;
5154
5155 if (rw == KSTAT_WRITE) {
500445c0 5156 return (EACCES);
13be560d
BB
5157 } else {
5158 arc_kstat_update_state(arc_anon,
5159 &as->arcstat_anon_size,
500445c0
PS
5160 &as->arcstat_anon_evictable_data,
5161 &as->arcstat_anon_evictable_metadata);
13be560d
BB
5162 arc_kstat_update_state(arc_mru,
5163 &as->arcstat_mru_size,
500445c0
PS
5164 &as->arcstat_mru_evictable_data,
5165 &as->arcstat_mru_evictable_metadata);
13be560d
BB
5166 arc_kstat_update_state(arc_mru_ghost,
5167 &as->arcstat_mru_ghost_size,
500445c0
PS
5168 &as->arcstat_mru_ghost_evictable_data,
5169 &as->arcstat_mru_ghost_evictable_metadata);
13be560d
BB
5170 arc_kstat_update_state(arc_mfu,
5171 &as->arcstat_mfu_size,
500445c0
PS
5172 &as->arcstat_mfu_evictable_data,
5173 &as->arcstat_mfu_evictable_metadata);
fc41c640 5174 arc_kstat_update_state(arc_mfu_ghost,
13be560d 5175 &as->arcstat_mfu_ghost_size,
500445c0
PS
5176 &as->arcstat_mfu_ghost_evictable_data,
5177 &as->arcstat_mfu_ghost_evictable_metadata);
13be560d
BB
5178 }
5179
5180 return (0);
5181}
5182
ca0bf58d
PS
5183/*
5184 * This function *must* return indices evenly distributed between all
5185 * sublists of the multilist. This is needed due to how the ARC eviction
5186 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
5187 * distributed between all sublists and uses this assumption when
5188 * deciding which sublist to evict from and how much to evict from it.
5189 */
5190unsigned int
5191arc_state_multilist_index_func(multilist_t *ml, void *obj)
5192{
5193 arc_buf_hdr_t *hdr = obj;
5194
5195 /*
5196 * We rely on b_dva to generate evenly distributed index
5197 * numbers using buf_hash below. So, as an added precaution,
5198 * let's make sure we never add empty buffers to the arc lists.
5199 */
5200 ASSERT(!BUF_EMPTY(hdr));
5201
5202 /*
5203 * The assumption here is that the hash value for a given
5204 * arc_buf_hdr_t will remain constant throughout its lifetime
5205 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
5206 * Thus, we don't need to store the header's sublist index
5207 * on insertion, as this index can be recalculated on removal.
5208 *
5209 * Also, the low order bits of the hash value are thought to be
5210 * distributed evenly. Otherwise, in the case that the multilist
5211 * has a power of two number of sublists, each sublist's usage
5212 * would not be evenly distributed.
5213 */
5214 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5215 multilist_get_num_sublists(ml));
5216}
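/*
 * Worked example (illustrative): with 8 sublists, a header whose
 * buf_hash(b_spa, &b_dva, b_birth) comes out to 0x2b lands in
 * sublist 0x2b % 8 == 3. Since the spa/dva/birth triple of a cached
 * header never changes, the same index is recomputed at removal time
 * without ever being stored in the header.
 */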
5217
ca67b33a
MA
5218/*
5219 * Called during module initialization and periodically thereafter to
5220 * apply reasonable changes to the exposed performance tunings. Non-zero
5221 * zfs_* values which differ from the currently set values will be applied.
5222 */
5223static void
5224arc_tuning_update(void)
5225{
5226 /* Valid range: 64M - <all physical memory> */
5227 if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
5228 (zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) &&
5229 (zfs_arc_max > arc_c_min)) {
5230 arc_c_max = zfs_arc_max;
5231 arc_c = arc_c_max;
5232 arc_p = (arc_c >> 1);
5233 arc_meta_limit = MIN(arc_meta_limit, arc_c_max);
5234 }
5235
5236 /* Valid range: 32M - <arc_c_max> */
5237 if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
5238 (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
5239 (zfs_arc_min <= arc_c_max)) {
5240 arc_c_min = zfs_arc_min;
5241 arc_c = MAX(arc_c, arc_c_min);
5242 }
5243
5244 /* Valid range: 16M - <arc_c_max> */
5245 if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
5246 (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
5247 (zfs_arc_meta_min <= arc_c_max)) {
5248 arc_meta_min = zfs_arc_meta_min;
5249 arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
5250 }
5251
5252 /* Valid range: <arc_meta_min> - <arc_c_max> */
5253 if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) &&
5254 (zfs_arc_meta_limit >= zfs_arc_meta_min) &&
5255 (zfs_arc_meta_limit <= arc_c_max))
5256 arc_meta_limit = zfs_arc_meta_limit;
5257
5258 /* Valid range: 1 - N */
5259 if (zfs_arc_grow_retry)
5260 arc_grow_retry = zfs_arc_grow_retry;
5261
5262 /* Valid range: 1 - N */
5263 if (zfs_arc_shrink_shift) {
5264 arc_shrink_shift = zfs_arc_shrink_shift;
5265 arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift - 1);
5266 }
5267
5268 /* Valid range: 1 - N ticks */
5269 if (zfs_arc_min_prefetch_lifespan)
5270 arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
5271}
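/*
 * Example of the clamping above (illustrative values): on a system
 * with 8 GB of physical memory, setting zfs_arc_max to 2 GB passes
 * the 64M floor and the physmem ceiling, so arc_c_max becomes 2 GB,
 * arc_c is reset to that target, arc_p to 1 GB, and arc_meta_limit
 * is capped at the new arc_c_max. An out-of-range value (e.g. 32M)
 * simply fails the range check and leaves the previous settings in
 * effect.
 */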
5272
34dc7c2f
BB
5273void
5274arc_init(void)
5275{
ca67b33a
MA
5276 /*
5277 * allmem is "all memory that we could possibly use".
5278 */
5279#ifdef _KERNEL
5280 uint64_t allmem = ptob(physmem);
5281#else
5282 uint64_t allmem = (physmem * PAGESIZE) / 2;
5283#endif
5284
ca0bf58d
PS
5285 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
5286 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
5287 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
5288
5289 mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5290 cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
34dc7c2f
BB
5291
5292 /* Convert seconds to clock ticks */
ca67b33a 5293 arc_min_prefetch_lifespan = 1 * hz;
34dc7c2f
BB
5294
5295 /* Start out with 1/8 of all memory */
ca67b33a 5296 arc_c = allmem / 8;
34dc7c2f
BB
5297
5298#ifdef _KERNEL
5299 /*
5300 * On architectures where the physical memory can be larger
5301 * than the addressable space (intel in 32-bit mode), we may
5302 * need to limit the cache to 1/8 of VM size.
5303 */
5304 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
ca67b33a 5305
7cb67b45
BB
5306 /*
5307 * Register a shrinker to support synchronous (direct) memory
5308 * reclaim from the arc. This is done to prevent kswapd from
5309 * swapping out pages when it is preferable to shrink the arc.
5310 */
5311 spl_register_shrinker(&arc_shrinker);
34dc7c2f
BB
5312#endif
5313
ca67b33a 5314 /* Set min cache to allow safe operation of arc_adapt() */
121b3cae 5315 arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT;
ca67b33a
MA
5316 /* Set max to 1/2 of all memory */
5317 arc_c_max = allmem / 2;
34dc7c2f
BB
5318
5319 arc_c = arc_c_max;
5320 arc_p = (arc_c >> 1);
5321
ca67b33a
MA
5322 /* Set min to 1/2 of arc_c_min */
5323 arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
5324 /* Initialize maximum observed usage to zero */
1834f2d8 5325 arc_meta_max = 0;
ca67b33a
MA
5326 /* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */
5327 arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min);
34dc7c2f 5328
ca67b33a
MA
5329 /* Apply user specified tunings */
5330 arc_tuning_update();
c52fca13 5331
ca0bf58d 5332 if (zfs_arc_num_sublists_per_state < 1)
ca67b33a 5333 zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
ca0bf58d 5334
34dc7c2f
BB
5335 /* if kmem_flags are set, lets try to use less memory */
5336 if (kmem_debugging())
5337 arc_c = arc_c / 2;
5338 if (arc_c < arc_c_min)
5339 arc_c = arc_c_min;
5340
5341 arc_anon = &ARC_anon;
5342 arc_mru = &ARC_mru;
5343 arc_mru_ghost = &ARC_mru_ghost;
5344 arc_mfu = &ARC_mfu;
5345 arc_mfu_ghost = &ARC_mfu_ghost;
5346 arc_l2c_only = &ARC_l2c_only;
5347 arc_size = 0;
5348
ca0bf58d 5349 multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5350 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5351 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5352 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5353 multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
b9541d6b 5354 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5355 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5356 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5357 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5358 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5359 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5360 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5361 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
b9541d6b 5362 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5363 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5364 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5365 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5366 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5367 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5368 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5369 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
b9541d6b 5370 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5371 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5372 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5373 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5374 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5375 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5376 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5377 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
b9541d6b 5378 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5379 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5380 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5381 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
b9541d6b 5382 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5383 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5384 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5385 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
b9541d6b 5386 sizeof (arc_buf_hdr_t),
ca0bf58d
PS
5387 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5388 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
34dc7c2f 5389
e0b0ca98
BB
5390 arc_anon->arcs_state = ARC_STATE_ANON;
5391 arc_mru->arcs_state = ARC_STATE_MRU;
5392 arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
5393 arc_mfu->arcs_state = ARC_STATE_MFU;
5394 arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
5395 arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
5396
36da08ef
PS
5397 refcount_create(&arc_anon->arcs_size);
5398 refcount_create(&arc_mru->arcs_size);
5399 refcount_create(&arc_mru_ghost->arcs_size);
5400 refcount_create(&arc_mfu->arcs_size);
5401 refcount_create(&arc_mfu_ghost->arcs_size);
5402 refcount_create(&arc_l2c_only->arcs_size);
5403
34dc7c2f
BB
5404 buf_init();
5405
ca0bf58d
PS
5406 arc_reclaim_thread_exit = FALSE;
5407 arc_user_evicts_thread_exit = FALSE;
ab26409d
BB
5408 list_create(&arc_prune_list, sizeof (arc_prune_t),
5409 offsetof(arc_prune_t, p_node));
34dc7c2f 5410 arc_eviction_list = NULL;
ab26409d 5411 mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
5412 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
5413
f6046738 5414 arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
aa9af22c 5415 max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
f6046738 5416
34dc7c2f
BB
5417 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
5418 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
5419
5420 if (arc_ksp != NULL) {
5421 arc_ksp->ks_data = &arc_stats;
13be560d 5422 arc_ksp->ks_update = arc_kstat_update;
34dc7c2f
BB
5423 kstat_install(arc_ksp);
5424 }
5425
ca67b33a 5426 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
34dc7c2f
BB
5427 TS_RUN, minclsyspri);
5428
ca0bf58d
PS
5429 (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
5430 TS_RUN, minclsyspri);
5431
34dc7c2f 5432 arc_dead = FALSE;
b128c09f 5433 arc_warm = B_FALSE;
34dc7c2f 5434
e8b96c60
MA
5435 /*
5436 * Calculate maximum amount of dirty data per pool.
5437 *
5438 * If it has been set by a module parameter, take that.
5439 * Otherwise, use a percentage of physical memory defined by
5440 * zfs_dirty_data_max_percent (default 10%) with a cap at
5441 * zfs_dirty_data_max_max (default 25% of physical memory).
5442 */
5443 if (zfs_dirty_data_max_max == 0)
5444 zfs_dirty_data_max_max = physmem * PAGESIZE *
5445 zfs_dirty_data_max_max_percent / 100;
5446
5447 if (zfs_dirty_data_max == 0) {
5448 zfs_dirty_data_max = physmem * PAGESIZE *
5449 zfs_dirty_data_max_percent / 100;
5450 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
5451 zfs_dirty_data_max_max);
5452 }
34dc7c2f
BB
5453}
5454
5455void
5456arc_fini(void)
5457{
ab26409d
BB
5458 arc_prune_t *p;
5459
7cb67b45
BB
5460#ifdef _KERNEL
5461 spl_unregister_shrinker(&arc_shrinker);
5462#endif /* _KERNEL */
5463
ca0bf58d
PS
5464 mutex_enter(&arc_reclaim_lock);
5465 arc_reclaim_thread_exit = TRUE;
5466 /*
5467 * The reclaim thread will set arc_reclaim_thread_exit back to
5468 * FALSE when it is finished exiting; we're waiting for that.
5469 */
5470 while (arc_reclaim_thread_exit) {
5471 cv_signal(&arc_reclaim_thread_cv);
5472 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
5473 }
5474 mutex_exit(&arc_reclaim_lock);
5475
5476 mutex_enter(&arc_user_evicts_lock);
5477 arc_user_evicts_thread_exit = TRUE;
5478 /*
5479 * The user evicts thread will set arc_user_evicts_thread_exit
5480 * to FALSE when it is finished exiting; we're waiting for that.
5481 */
5482 while (arc_user_evicts_thread_exit) {
5483 cv_signal(&arc_user_evicts_cv);
5484 cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5485 }
5486 mutex_exit(&arc_user_evicts_lock);
34dc7c2f 5487
ca0bf58d
PS
5488 /* Use TRUE to ensure *all* buffers are evicted */
5489 arc_flush(NULL, TRUE);
34dc7c2f
BB
5490
5491 arc_dead = TRUE;
5492
5493 if (arc_ksp != NULL) {
5494 kstat_delete(arc_ksp);
5495 arc_ksp = NULL;
5496 }
5497
f6046738
BB
5498 taskq_wait(arc_prune_taskq);
5499 taskq_destroy(arc_prune_taskq);
5500
ab26409d
BB
5501 mutex_enter(&arc_prune_mtx);
5502 while ((p = list_head(&arc_prune_list)) != NULL) {
5503 list_remove(&arc_prune_list, p);
5504 refcount_remove(&p->p_refcnt, &arc_prune_list);
5505 refcount_destroy(&p->p_refcnt);
5506 kmem_free(p, sizeof (*p));
5507 }
5508 mutex_exit(&arc_prune_mtx);
5509
5510 list_destroy(&arc_prune_list);
5511 mutex_destroy(&arc_prune_mtx);
ca0bf58d
PS
5512 mutex_destroy(&arc_reclaim_lock);
5513 cv_destroy(&arc_reclaim_thread_cv);
5514 cv_destroy(&arc_reclaim_waiters_cv);
5515
5516 mutex_destroy(&arc_user_evicts_lock);
5517 cv_destroy(&arc_user_evicts_cv);
5518
36da08ef
PS
5519 refcount_destroy(&arc_anon->arcs_size);
5520 refcount_destroy(&arc_mru->arcs_size);
5521 refcount_destroy(&arc_mru_ghost->arcs_size);
5522 refcount_destroy(&arc_mfu->arcs_size);
5523 refcount_destroy(&arc_mfu_ghost->arcs_size);
5524 refcount_destroy(&arc_l2c_only->arcs_size);
5525
ca0bf58d
PS
5526 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
5527 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
5528 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
5529 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
5530 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
5531 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
5532 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
5533 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
5534 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
5535 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
34dc7c2f
BB
5536
5537 buf_fini();
9babb374 5538
b9541d6b 5539 ASSERT0(arc_loaned_bytes);
34dc7c2f
BB
5540}
5541
5542/*
5543 * Level 2 ARC
5544 *
5545 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5546 * It uses dedicated storage devices to hold cached data, which are populated
5547 * using large infrequent writes. The main role of this cache is to boost
5548 * the performance of random read workloads. The intended L2ARC devices
5549 * include short-stroked disks, solid state disks, and other media with
5550 * substantially faster read latency than disk.
5551 *
5552 * +-----------------------+
5553 * | ARC |
5554 * +-----------------------+
5555 * | ^ ^
5556 * | | |
5557 * l2arc_feed_thread() arc_read()
5558 * | | |
5559 * | l2arc read |
5560 * V | |
5561 * +---------------+ |
5562 * | L2ARC | |
5563 * +---------------+ |
5564 * | ^ |
5565 * l2arc_write() | |
5566 * | | |
5567 * V | |
5568 * +-------+ +-------+
5569 * | vdev | | vdev |
5570 * | cache | | cache |
5571 * +-------+ +-------+
5572 * +=========+ .-----.
5573 * : L2ARC : |-_____-|
5574 * : devices : | Disks |
5575 * +=========+ `-_____-'
5576 *
5577 * Read requests are satisfied from the following sources, in order:
5578 *
5579 * 1) ARC
5580 * 2) vdev cache of L2ARC devices
5581 * 3) L2ARC devices
5582 * 4) vdev cache of disks
5583 * 5) disks
5584 *
5585 * Some L2ARC device types exhibit extremely slow write performance.
5586 * To accommodate this, there are some significant differences between
5587 * the L2ARC and traditional cache design:
5588 *
5589 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
5590 * the ARC behave as usual, freeing buffers and placing headers on ghost
5591 * lists. The ARC does not send buffers to the L2ARC during eviction as
5592 * this would add inflated write latencies for all ARC memory pressure.
5593 *
5594 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5595 * It does this by periodically scanning buffers from the eviction-end of
5596 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3a17a7a9
SK
5597 * not already there. It scans until a headroom of buffers is satisfied,
5598 * which itself is a buffer for ARC eviction. If a compressible buffer is
5599 * found during scanning and selected for writing to an L2ARC device, we
5600 * temporarily boost scanning headroom during the next scan cycle to make
5601 * sure we adapt to compression effects (which might significantly reduce
5602 * the data volume we write to L2ARC). The thread that does this is
34dc7c2f
BB
5603 * l2arc_feed_thread(), illustrated below; example sizes are included to
5604 * provide a better sense of ratio than this diagram:
5605 *
5606 * head --> tail
5607 * +---------------------+----------+
5608 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
5609 * +---------------------+----------+ | o L2ARC eligible
5610 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
5611 * +---------------------+----------+ |
5612 * 15.9 Gbytes ^ 32 Mbytes |
5613 * headroom |
5614 * l2arc_feed_thread()
5615 * |
5616 * l2arc write hand <--[oooo]--'
5617 * | 8 Mbyte
5618 * | write max
5619 * V
5620 * +==============================+
5621 * L2ARC dev |####|#|###|###| |####| ... |
5622 * +==============================+
5623 * 32 Gbytes
5624 *
5625 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5626 * evicted, then the L2ARC has cached a buffer much sooner than it probably
5627 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
5628 * safe to say that this is an uncommon case, since buffers at the end of
5629 * the ARC lists have moved there due to inactivity.
5630 *
5631 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5632 * then the L2ARC simply misses copying some buffers. This serves as a
5633 * pressure valve to prevent heavy read workloads from both stalling the ARC
5634 * with waits and clogging the L2ARC with writes. This also helps prevent
5635 * the potential for the L2ARC to churn if it attempts to cache content too
5636 * quickly, such as during backups of the entire pool.
5637 *
b128c09f
BB
5638 * 5. After system boot and before the ARC has filled main memory, there are
5639 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5640 * lists can remain mostly static. Instead of searching from tail of these
5641 * lists as pictured, the l2arc_feed_thread() will search from the list heads
5642 * for eligible buffers, greatly increasing its chance of finding them.
5643 *
5644 * The L2ARC device write speed is also boosted during this time so that
5645 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
5646 * there are no L2ARC reads, and no fear of degrading read performance
5647 * through increased writes.
5648 *
5649 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
34dc7c2f
BB
5650 * the vdev queue can aggregate them into larger and fewer writes. Each
5651 * device is written to in a rotor fashion, sweeping writes through
5652 * available space then repeating.
5653 *
b128c09f 5654 * 7. The L2ARC does not store dirty content. It never needs to flush
34dc7c2f
BB
5655 * write buffers back to disk based storage.
5656 *
b128c09f 5657 * 8. If an ARC buffer is written (and dirtied) which also exists in the
34dc7c2f
BB
5658 * L2ARC, the now stale L2ARC buffer is immediately dropped.
5659 *
5660 * The performance of the L2ARC can be tweaked by a number of tunables, which
5661 * may be necessary for different workloads:
5662 *
5663 * l2arc_write_max max write bytes per interval
b128c09f 5664 * l2arc_write_boost extra write bytes during device warmup
34dc7c2f 5665 * l2arc_noprefetch skip caching prefetched buffers
3a17a7a9 5666 * l2arc_nocompress skip compressing buffers
34dc7c2f 5667 * l2arc_headroom number of max device writes to precache
3a17a7a9
SK
5668 * l2arc_headroom_boost when we find compressed buffers during ARC
5669 * scanning, we multiply headroom by this
5670 * percentage factor for the next scan cycle,
5671 * since more compressed buffers are likely to
5672 * be present
34dc7c2f
BB
5673 * l2arc_feed_secs seconds between L2ARC writing
5674 *
5675 * Tunables may be removed or added as future performance improvements are
5676 * integrated, and also may become zpool properties.
d164b209
BB
5677 *
5678 * There are three key functions that control how the L2ARC warms up:
5679 *
5680 * l2arc_write_eligible() check if a buffer is eligible to cache
5681 * l2arc_write_size() calculate how much to write
5682 * l2arc_write_interval() calculate sleep delay between writes
5683 *
5684 * These three functions determine what to write, how much, and how quickly
5685 * to send writes.
34dc7c2f
BB
5686 */
5687
d164b209 5688static boolean_t
2a432414 5689l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
d164b209
BB
5690{
5691 /*
5692 * A buffer is *not* eligible for the L2ARC if it:
5693 * 1. belongs to a different spa.
428870ff
BB
5694 * 2. is already cached on the L2ARC.
5695 * 3. has an I/O in progress (it may be an incomplete read).
5696 * 4. is flagged not eligible (zfs property).
d164b209 5697 */
b9541d6b 5698 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
2a432414 5699 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
d164b209
BB
5700 return (B_FALSE);
5701
5702 return (B_TRUE);
5703}
5704
5705static uint64_t
3a17a7a9 5706l2arc_write_size(void)
d164b209
BB
5707{
5708 uint64_t size;
5709
3a17a7a9
SK
5710 /*
5711 * Make sure our globals have meaningful values in case the user
5712 * altered them.
5713 */
5714 size = l2arc_write_max;
5715 if (size == 0) {
5716 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5717 "be greater than zero, resetting it to the default (%d)",
5718 L2ARC_WRITE_SIZE);
5719 size = l2arc_write_max = L2ARC_WRITE_SIZE;
5720 }
d164b209
BB
5721
5722 if (arc_warm == B_FALSE)
3a17a7a9 5723 size += l2arc_write_boost;
d164b209
BB
5724
5725 return (size);
5726
5727}
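/*
 * Illustrative sizing (assuming 8 MB values for both l2arc_write_max,
 * the write max pictured in the diagram above, and l2arc_write_boost):
 * while the ARC is still warming up (arc_warm == B_FALSE) each feed
 * cycle targets 8 MB + 8 MB == 16 MB of writes; once the ARC has
 * begun evicting, the target drops back to the plain 8 MB write max.
 */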
5728
5729static clock_t
5730l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5731{
428870ff 5732 clock_t interval, next, now;
d164b209
BB
5733
5734 /*
5735 * If the ARC lists are busy, increase our write rate; if the
5736 * lists are stale, idle back. This is achieved by checking
5737 * how much we previously wrote - if it was more than half of
5738 * what we wanted, schedule the next write much sooner.
5739 */
5740 if (l2arc_feed_again && wrote > (wanted / 2))
5741 interval = (hz * l2arc_feed_min_ms) / 1000;
5742 else
5743 interval = hz * l2arc_feed_secs;
5744
428870ff
BB
5745 now = ddi_get_lbolt();
5746 next = MAX(now, MIN(now + interval, began + interval));
d164b209
BB
5747
5748 return (next);
5749}
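/*
 * Worked example (illustrative, assuming hz == 100 with the default
 * l2arc_feed_secs == 1, l2arc_feed_min_ms == 200 and l2arc_feed_again
 * enabled): if the previous pass wrote more than half of what it
 * wanted, the next feed is scheduled (100 * 200) / 1000 == 20 ticks
 * out; otherwise it waits the full 100 ticks. The MAX/MIN clamp then
 * schedules the wakeup one interval after the pass began, but never
 * earlier than the current time.
 */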
5750
34dc7c2f
BB
5751/*
5752 * Cycle through L2ARC devices. This is how L2ARC load balances.
b128c09f 5753 * If a device is returned, this also returns holding the spa config lock.
34dc7c2f
BB
5754 */
5755static l2arc_dev_t *
5756l2arc_dev_get_next(void)
5757{
b128c09f 5758 l2arc_dev_t *first, *next = NULL;
34dc7c2f 5759
b128c09f
BB
5760 /*
5761 * Lock out the removal of spas (spa_namespace_lock), then removal
5762 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
5763 * both locks will be dropped and a spa config lock held instead.
5764 */
5765 mutex_enter(&spa_namespace_lock);
5766 mutex_enter(&l2arc_dev_mtx);
5767
5768 /* if there are no vdevs, there is nothing to do */
5769 if (l2arc_ndev == 0)
5770 goto out;
5771
5772 first = NULL;
5773 next = l2arc_dev_last;
5774 do {
5775 /* loop around the list looking for a non-faulted vdev */
5776 if (next == NULL) {
34dc7c2f 5777 next = list_head(l2arc_dev_list);
b128c09f
BB
5778 } else {
5779 next = list_next(l2arc_dev_list, next);
5780 if (next == NULL)
5781 next = list_head(l2arc_dev_list);
5782 }
5783
5784 /* if we have come back to the start, bail out */
5785 if (first == NULL)
5786 first = next;
5787 else if (next == first)
5788 break;
5789
5790 } while (vdev_is_dead(next->l2ad_vdev));
5791
5792 /* if we were unable to find any usable vdevs, return NULL */
5793 if (vdev_is_dead(next->l2ad_vdev))
5794 next = NULL;
34dc7c2f
BB
5795
5796 l2arc_dev_last = next;
5797
b128c09f
BB
5798out:
5799 mutex_exit(&l2arc_dev_mtx);
5800
5801 /*
5802 * Grab the config lock to prevent the 'next' device from being
5803 * removed while we are writing to it.
5804 */
5805 if (next != NULL)
5806 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5807 mutex_exit(&spa_namespace_lock);
5808
34dc7c2f
BB
5809 return (next);
5810}
5811
b128c09f
BB
5812/*
5813 * Free buffers that were tagged for destruction.
5814 */
5815static void
0bc8fd78 5816l2arc_do_free_on_write(void)
b128c09f
BB
5817{
5818 list_t *buflist;
5819 l2arc_data_free_t *df, *df_prev;
5820
5821 mutex_enter(&l2arc_free_on_write_mtx);
5822 buflist = l2arc_free_on_write;
5823
5824 for (df = list_tail(buflist); df; df = df_prev) {
5825 df_prev = list_prev(buflist, df);
5826 ASSERT(df->l2df_data != NULL);
5827 ASSERT(df->l2df_func != NULL);
5828 df->l2df_func(df->l2df_data, df->l2df_size);
5829 list_remove(buflist, df);
5830 kmem_free(df, sizeof (l2arc_data_free_t));
5831 }
5832
5833 mutex_exit(&l2arc_free_on_write_mtx);
5834}
5835
34dc7c2f
BB
5836/*
5837 * A write to a cache device has completed. Update all headers to allow
5838 * reads from these buffers to begin.
5839 */
5840static void
5841l2arc_write_done(zio_t *zio)
5842{
5843 l2arc_write_callback_t *cb;
5844 l2arc_dev_t *dev;
5845 list_t *buflist;
2a432414 5846 arc_buf_hdr_t *head, *hdr, *hdr_prev;
34dc7c2f 5847 kmutex_t *hash_lock;
3bec585e 5848 int64_t bytes_dropped = 0;
34dc7c2f
BB
5849
5850 cb = zio->io_private;
5851 ASSERT(cb != NULL);
5852 dev = cb->l2wcb_dev;
5853 ASSERT(dev != NULL);
5854 head = cb->l2wcb_head;
5855 ASSERT(head != NULL);
b9541d6b 5856 buflist = &dev->l2ad_buflist;
34dc7c2f
BB
5857 ASSERT(buflist != NULL);
5858 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5859 l2arc_write_callback_t *, cb);
5860
5861 if (zio->io_error != 0)
5862 ARCSTAT_BUMP(arcstat_l2_writes_error);
5863
34dc7c2f
BB
5864 /*
5865 * All writes completed, or an error was hit.
5866 */
ca0bf58d
PS
5867top:
5868 mutex_enter(&dev->l2ad_mtx);
2a432414
GW
5869 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5870 hdr_prev = list_prev(buflist, hdr);
34dc7c2f 5871
2a432414 5872 hash_lock = HDR_LOCK(hdr);
ca0bf58d
PS
5873
5874 /*
5875 * We cannot use mutex_enter or else we can deadlock
5876 * with l2arc_write_buffers (due to swapping the order
5877 * the hash lock and l2ad_mtx are taken).
5878 */
34dc7c2f
BB
5879 if (!mutex_tryenter(hash_lock)) {
5880 /*
ca0bf58d
PS
5881 * Missed the hash lock. We must retry so we
5882 * don't leave the ARC_FLAG_L2_WRITING bit set.
34dc7c2f 5883 */
ca0bf58d
PS
5884 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
5885
5886 /*
5887 * We don't want to rescan the headers we've
5888 * already marked as having been written out, so
5889 * we reinsert the head node so we can pick up
5890 * where we left off.
5891 */
5892 list_remove(buflist, head);
5893 list_insert_after(buflist, hdr, head);
5894
5895 mutex_exit(&dev->l2ad_mtx);
5896
5897 /*
5898 * We wait for the hash lock to become available
5899 * to try and prevent busy waiting, and increase
5900 * the chance we'll be able to acquire the lock
5901 * the next time around.
5902 */
5903 mutex_enter(hash_lock);
5904 mutex_exit(hash_lock);
5905 goto top;
34dc7c2f
BB
5906 }
5907
b9541d6b 5908 /*
ca0bf58d
PS
5909 * We could not have been moved into the arc_l2c_only
5910 * state while in-flight due to our ARC_FLAG_L2_WRITING
5911 * bit being set. Let's just ensure that's being enforced.
5912 */
5913 ASSERT(HDR_HAS_L1HDR(hdr));
5914
5915 /*
5916 * We may have allocated a buffer for L2ARC compression;
5917 * we must release it to avoid leaking this data.
b9541d6b 5918 */
ca0bf58d 5919 l2arc_release_cdata_buf(hdr);
b9541d6b 5920
34dc7c2f
BB
5921 if (zio->io_error != 0) {
5922 /*
b128c09f 5923 * Error - drop L2ARC entry.
34dc7c2f 5924 */
2a432414 5925 list_remove(buflist, hdr);
b9541d6b
CW
5926 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5927
5928 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
2a432414 5929 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
d962d5da
PS
5930
5931 bytes_dropped += hdr->b_l2hdr.b_asize;
5932 (void) refcount_remove_many(&dev->l2ad_alloc,
5933 hdr->b_l2hdr.b_asize, hdr);
34dc7c2f
BB
5934 }
5935
5936 /*
ca0bf58d
PS
5937 * Allow ARC to begin reads and ghost list evictions to
5938 * this L2ARC entry.
34dc7c2f 5939 */
2a432414 5940 hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
34dc7c2f
BB
5941
5942 mutex_exit(hash_lock);
5943 }
5944
5945 atomic_inc_64(&l2arc_writes_done);
5946 list_remove(buflist, head);
b9541d6b
CW
5947 ASSERT(!HDR_HAS_L1HDR(head));
5948 kmem_cache_free(hdr_l2only_cache, head);
5949 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 5950
3bec585e
SK
5951 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5952
b128c09f 5953 l2arc_do_free_on_write();
34dc7c2f
BB
5954
5955 kmem_free(cb, sizeof (l2arc_write_callback_t));
5956}
5957
5958/*
5959 * A read to a cache device has completed. Validate buffer contents before
5960 * handing over to the regular ARC routines.
5961 */
5962static void
5963l2arc_read_done(zio_t *zio)
5964{
5965 l2arc_read_callback_t *cb;
5966 arc_buf_hdr_t *hdr;
5967 arc_buf_t *buf;
34dc7c2f 5968 kmutex_t *hash_lock;
b128c09f
BB
5969 int equal;
5970
5971 ASSERT(zio->io_vd != NULL);
5972 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5973
5974 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
34dc7c2f
BB
5975
5976 cb = zio->io_private;
5977 ASSERT(cb != NULL);
5978 buf = cb->l2rcb_buf;
5979 ASSERT(buf != NULL);
34dc7c2f 5980
428870ff 5981 hash_lock = HDR_LOCK(buf->b_hdr);
34dc7c2f 5982 mutex_enter(hash_lock);
428870ff
BB
5983 hdr = buf->b_hdr;
5984 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
34dc7c2f 5985
3a17a7a9
SK
5986 /*
5987 * If the buffer was compressed, decompress it first.
5988 */
5989 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5990 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5991 ASSERT(zio->io_data != NULL);
5992
34dc7c2f
BB
5993 /*
5994 * Check this survived the L2ARC journey.
5995 */
5996 equal = arc_cksum_equal(buf);
5997 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5998 mutex_exit(hash_lock);
5999 zio->io_private = buf;
b128c09f
BB
6000 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
6001 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
34dc7c2f
BB
6002 arc_read_done(zio);
6003 } else {
6004 mutex_exit(hash_lock);
6005 /*
6006 * Buffer didn't survive caching. Increment stats and
6007 * reissue to the original storage device.
6008 */
b128c09f 6009 if (zio->io_error != 0) {
34dc7c2f 6010 ARCSTAT_BUMP(arcstat_l2_io_error);
b128c09f 6011 } else {
2e528b49 6012 zio->io_error = SET_ERROR(EIO);
b128c09f 6013 }
34dc7c2f
BB
6014 if (!equal)
6015 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
6016
34dc7c2f 6017 /*
b128c09f
BB
6018 * If there's no waiter, issue an async i/o to the primary
6019 * storage now. If there *is* a waiter, the caller must
6020 * issue the i/o in a context where it's OK to block.
34dc7c2f 6021 */
d164b209
BB
6022 if (zio->io_waiter == NULL) {
6023 zio_t *pio = zio_unique_parent(zio);
6024
6025 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
6026
6027 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
b128c09f
BB
6028 buf->b_data, zio->io_size, arc_read_done, buf,
6029 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
d164b209 6030 }
34dc7c2f
BB
6031 }
6032
6033 kmem_free(cb, sizeof (l2arc_read_callback_t));
6034}
6035
6036/*
6037 * This is the list priority from which the L2ARC will search for pages to
6038 * cache. This is used within loops (0..3) to cycle through lists in the
6039 * desired order. This order can have a significant effect on cache
6040 * performance.
6041 *
6042 * Currently the metadata lists are hit first, MFU then MRU, followed by
6043 * the data lists. This function returns a locked list, and also returns
6044 * the lock pointer.
6045 */
ca0bf58d
PS
6046static multilist_sublist_t *
6047l2arc_sublist_lock(int list_num)
34dc7c2f 6048{
ca0bf58d
PS
6049 multilist_t *ml = NULL;
6050 unsigned int idx;
34dc7c2f
BB
6051
6052 ASSERT(list_num >= 0 && list_num <= 3);
6053
6054 switch (list_num) {
6055 case 0:
ca0bf58d 6056 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
34dc7c2f
BB
6057 break;
6058 case 1:
ca0bf58d 6059 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
34dc7c2f
BB
6060 break;
6061 case 2:
ca0bf58d 6062 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
34dc7c2f
BB
6063 break;
6064 case 3:
ca0bf58d 6065 ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
34dc7c2f
BB
6066 break;
6067 }
6068
ca0bf58d
PS
6069 /*
6070 * Return a randomly-selected sublist. This is acceptable
6071 * because the caller feeds only a little bit of data for each
6072 * call (8MB). Subsequent calls will result in different
6073 * sublists being selected.
6074 */
6075 idx = multilist_get_random_index(ml);
6076 return (multilist_sublist_lock(ml, idx));
34dc7c2f
BB
6077}
6078
6079/*
6080 * Evict buffers from the device write hand to the distance specified in
6081 * bytes. This distance may span populated buffers, or it may span nothing.
6082 * This is clearing a region on the L2ARC device ready for writing.
6083 * If the 'all' boolean is set, every buffer is evicted.
6084 */
6085static void
6086l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
6087{
6088 list_t *buflist;
2a432414 6089 arc_buf_hdr_t *hdr, *hdr_prev;
34dc7c2f
BB
6090 kmutex_t *hash_lock;
6091 uint64_t taddr;
6092
b9541d6b 6093 buflist = &dev->l2ad_buflist;
34dc7c2f
BB
6094
6095 if (!all && dev->l2ad_first) {
6096 /*
6097 * This is the first sweep through the device. There is
6098 * nothing to evict.
6099 */
6100 return;
6101 }
6102
b128c09f 6103 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
34dc7c2f
BB
6104 /*
6105 * When nearing the end of the device, evict to the end
6106 * before the device write hand jumps to the start.
6107 */
6108 taddr = dev->l2ad_end;
6109 } else {
6110 taddr = dev->l2ad_hand + distance;
6111 }
6112 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
6113 uint64_t, taddr, boolean_t, all);
6114
6115top:
b9541d6b 6116 mutex_enter(&dev->l2ad_mtx);
2a432414
GW
6117 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
6118 hdr_prev = list_prev(buflist, hdr);
34dc7c2f 6119
2a432414 6120 hash_lock = HDR_LOCK(hdr);
ca0bf58d
PS
6121
6122 /*
6123 * We cannot use mutex_enter or else we can deadlock
6124 * with l2arc_write_buffers (due to swapping the order
6125 * the hash lock and l2ad_mtx are taken).
6126 */
34dc7c2f
BB
6127 if (!mutex_tryenter(hash_lock)) {
6128 /*
6129 * Missed the hash lock. Retry.
6130 */
6131 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
b9541d6b 6132 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
6133 mutex_enter(hash_lock);
6134 mutex_exit(hash_lock);
6135 goto top;
6136 }
6137
2a432414 6138 if (HDR_L2_WRITE_HEAD(hdr)) {
34dc7c2f
BB
6139 /*
6140 * We hit a write head node. Leave it for
6141 * l2arc_write_done().
6142 */
2a432414 6143 list_remove(buflist, hdr);
34dc7c2f
BB
6144 mutex_exit(hash_lock);
6145 continue;
6146 }
6147
b9541d6b
CW
6148 if (!all && HDR_HAS_L2HDR(hdr) &&
6149 (hdr->b_l2hdr.b_daddr > taddr ||
6150 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
34dc7c2f
BB
6151 /*
6152 * We've evicted to the target address,
6153 * or the end of the device.
6154 */
6155 mutex_exit(hash_lock);
6156 break;
6157 }
6158
b9541d6b
CW
6159 ASSERT(HDR_HAS_L2HDR(hdr));
6160 if (!HDR_HAS_L1HDR(hdr)) {
2a432414 6161 ASSERT(!HDR_L2_READING(hdr));
34dc7c2f
BB
6162 /*
6163 * This doesn't exist in the ARC. Destroy.
6164 * arc_hdr_destroy() will call list_remove()
6165 * and decrement arcstat_l2_size.
6166 */
2a432414
GW
6167 arc_change_state(arc_anon, hdr, hash_lock);
6168 arc_hdr_destroy(hdr);
34dc7c2f 6169 } else {
b9541d6b
CW
6170 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
6171 ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
b128c09f
BB
6172 /*
6173 * Invalidate issued or about to be issued
6174 * reads, since we may be about to write
6175 * over this location.
6176 */
2a432414 6177 if (HDR_L2_READING(hdr)) {
b128c09f 6178 ARCSTAT_BUMP(arcstat_l2_evict_reading);
2a432414 6179 hdr->b_flags |= ARC_FLAG_L2_EVICTED;
b128c09f
BB
6180 }
6181
ca0bf58d
PS
6182 /* Ensure this header has finished being written */
6183 ASSERT(!HDR_L2_WRITING(hdr));
6184 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
d962d5da
PS
6185
6186 arc_hdr_l2hdr_destroy(hdr);
34dc7c2f
BB
6187 }
6188 mutex_exit(hash_lock);
6189 }
b9541d6b 6190 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
6191}
6192
6193/*
6194 * Find and write ARC buffers to the L2ARC device.
6195 *
2a432414 6196 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
34dc7c2f 6197 * for reading until they have completed writing.
3a17a7a9
SK
6198 * The headroom_boost is an in-out parameter used to maintain headroom boost
6199 * state between calls to this function.
6200 *
6201 * Returns the number of bytes actually written (which may be smaller than
6202 * the delta by which the device hand has changed due to alignment).
34dc7c2f 6203 */
d164b209 6204static uint64_t
3a17a7a9
SK
6205l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
6206 boolean_t *headroom_boost)
34dc7c2f 6207{
2a432414 6208 arc_buf_hdr_t *hdr, *hdr_prev, *head;
ef56b078
AG
6209 uint64_t write_asize, write_sz, headroom, buf_compress_minsz,
6210 stats_size;
34dc7c2f 6211 void *buf_data;
3a17a7a9 6212 boolean_t full;
34dc7c2f
BB
6213 l2arc_write_callback_t *cb;
6214 zio_t *pio, *wzio;
3541dc6d 6215 uint64_t guid = spa_load_guid(spa);
d6320ddb 6216 int try;
3a17a7a9 6217 const boolean_t do_headroom_boost = *headroom_boost;
34dc7c2f 6218
34dc7c2f
BB
6219 ASSERT(dev->l2ad_vdev != NULL);
6220
3a17a7a9
SK
6221 /* Lower the flag now, we might want to raise it again later. */
6222 *headroom_boost = B_FALSE;
6223
34dc7c2f 6224 pio = NULL;
ef56b078 6225 write_sz = write_asize = 0;
34dc7c2f 6226 full = B_FALSE;
b9541d6b 6227 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
2a432414 6228 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
b9541d6b 6229 head->b_flags |= ARC_FLAG_HAS_L2HDR;
34dc7c2f 6230
3a17a7a9
SK
6231 /*
6232 * We will want to try to compress buffers that are at least 2x the
6233 * device sector size.
6234 */
6235 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
6236
34dc7c2f
BB
6237 /*
6238 * Copy buffers for L2ARC writing.
6239 */
d6320ddb 6240 for (try = 0; try <= 3; try++) {
ca0bf58d 6241 multilist_sublist_t *mls = l2arc_sublist_lock(try);
3a17a7a9
SK
6242 uint64_t passed_sz = 0;
6243
b128c09f
BB
6244 /*
6245 * L2ARC fast warmup.
6246 *
6247 * Until the ARC is warm and starts to evict, read from the
6248 * head of the ARC lists rather than the tail.
6249 */
b128c09f 6250 if (arc_warm == B_FALSE)
ca0bf58d 6251 hdr = multilist_sublist_head(mls);
b128c09f 6252 else
ca0bf58d 6253 hdr = multilist_sublist_tail(mls);
b128c09f 6254
3a17a7a9
SK
6255 headroom = target_sz * l2arc_headroom;
6256 if (do_headroom_boost)
6257 headroom = (headroom * l2arc_headroom_boost) / 100;
6258
2a432414 6259 for (; hdr; hdr = hdr_prev) {
3a17a7a9
SK
6260 kmutex_t *hash_lock;
6261 uint64_t buf_sz;
ef56b078 6262 uint64_t buf_a_sz;
3a17a7a9 6263
b128c09f 6264 if (arc_warm == B_FALSE)
ca0bf58d 6265 hdr_prev = multilist_sublist_next(mls, hdr);
b128c09f 6266 else
ca0bf58d 6267 hdr_prev = multilist_sublist_prev(mls, hdr);
34dc7c2f 6268
2a432414 6269 hash_lock = HDR_LOCK(hdr);
3a17a7a9 6270 if (!mutex_tryenter(hash_lock)) {
34dc7c2f
BB
6271 /*
6272 * Skip this buffer rather than waiting.
6273 */
6274 continue;
6275 }
6276
2a432414 6277 passed_sz += hdr->b_size;
34dc7c2f
BB
6278 if (passed_sz > headroom) {
6279 /*
6280 * Searched too far.
6281 */
6282 mutex_exit(hash_lock);
6283 break;
6284 }
6285
2a432414 6286 if (!l2arc_write_eligible(guid, hdr)) {
34dc7c2f
BB
6287 mutex_exit(hash_lock);
6288 continue;
6289 }
6290
ef56b078
AG
6291 /*
6292 * Assume that the buffer is not going to be compressed
6293 * and could take more space on disk because of a larger
6294 * disk block size.
6295 */
6296 buf_sz = hdr->b_size;
6297 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6298
6299 if ((write_asize + buf_a_sz) > target_sz) {
34dc7c2f
BB
6300 full = B_TRUE;
6301 mutex_exit(hash_lock);
6302 break;
6303 }
6304
34dc7c2f
BB
6305 if (pio == NULL) {
6306 /*
6307 * Insert a dummy header on the buflist so
6308 * l2arc_write_done() can find where the
6309 * write buffers begin without searching.
6310 */
ca0bf58d 6311 mutex_enter(&dev->l2ad_mtx);
b9541d6b 6312 list_insert_head(&dev->l2ad_buflist, head);
ca0bf58d 6313 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 6314
409dc1a5 6315 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
79c76d5b 6316 KM_SLEEP);
34dc7c2f
BB
6317 cb->l2wcb_dev = dev;
6318 cb->l2wcb_head = head;
6319 pio = zio_root(spa, l2arc_write_done, cb,
6320 ZIO_FLAG_CANFAIL);
6321 }
6322
6323 /*
6324 * Create and add a new L2ARC header.
6325 */
b9541d6b
CW
6326 hdr->b_l2hdr.b_dev = dev;
6327 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
2a432414 6328 hdr->b_flags |= ARC_FLAG_L2_WRITING;
3a17a7a9
SK
6329 /*
6330 * Temporarily stash the data buffer in b_tmp_cdata.
6331 * The subsequent write step will pick it up from
b9541d6b 6332 * there. This is because we can't access b_l1hdr.b_buf
3a17a7a9
SK
6333 * without holding the hash_lock, which we in turn
6334 * can't access without holding the ARC list locks
6335 * (which we want to avoid during compression/writing)
6336 */
b9541d6b
CW
6337 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
6338 hdr->b_l2hdr.b_asize = hdr->b_size;
6339 hdr->b_l2hdr.b_hits = 0;
6340 hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
3a17a7a9 6341
d962d5da
PS
6342 /*
6343 * Explicitly set the b_daddr field to a known
6344 * value which means "invalid address". This
6345 * enables us to differentiate which stage of
6346 * l2arc_write_buffers() the particular header
6347 * is in (e.g. this loop, or the one below).
6348 * ARC_FLAG_L2_WRITING is not enough to make
6349 * this distinction, and we need to know in
6350 * order to do proper l2arc vdev accounting in
6351 * arc_release() and arc_hdr_destroy().
6352 *
6353 * Note, we can't use a new flag to distinguish
6354 * the two stages because we don't hold the
6355 * header's hash_lock below, in the second stage
6356 * of this function. Thus, we can't simply
6357 * change the b_flags field to denote that the
6358 * IO has been sent. We can change the b_daddr
6359 * field of the L2 portion, though, since we'll
6360 * be holding the l2ad_mtx; which is why we're
6361 * using it to denote the header's state change.
6362 */
6363 hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
b9541d6b 6364 hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
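			/*
			 * In short (restating the comment above):
			 *	b_daddr == L2ARC_ADDR_UNSET : selected, write not
			 *	                              yet issued (this loop)
			 *	b_daddr == device offset    : write issued (second
			 *	                              loop below)
			 * with ARC_FLAG_L2_WRITING set in both cases.
			 */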
3a17a7a9 6365
ca0bf58d 6366 mutex_enter(&dev->l2ad_mtx);
b9541d6b 6367 list_insert_head(&dev->l2ad_buflist, hdr);
ca0bf58d 6368 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
6369
6370 /*
6371 * Compute and store the buffer cksum before
6372 * writing. On debug the cksum is verified first.
6373 */
b9541d6b
CW
6374 arc_cksum_verify(hdr->b_l1hdr.b_buf);
6375 arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
34dc7c2f
BB
6376
6377 mutex_exit(hash_lock);
6378
3a17a7a9 6379 write_sz += buf_sz;
ef56b078 6380 write_asize += buf_a_sz;
3a17a7a9
SK
6381 }
6382
ca0bf58d 6383 multilist_sublist_unlock(mls);
3a17a7a9
SK
6384
6385 if (full == B_TRUE)
6386 break;
6387 }
6388
6389 /* No buffers selected for writing? */
6390 if (pio == NULL) {
6391 ASSERT0(write_sz);
b9541d6b
CW
6392 ASSERT(!HDR_HAS_L1HDR(head));
6393 kmem_cache_free(hdr_l2only_cache, head);
3a17a7a9
SK
6394 return (0);
6395 }
6396
ca0bf58d
PS
6397 mutex_enter(&dev->l2ad_mtx);
6398
ef56b078
AG
6399 /*
6400 * Note that elsewhere in this file arcstat_l2_asize
6401 * and the used space on l2ad_vdev are updated using b_asize,
6402 * which is not necessarily rounded up to the device block size.
 6403 * To keep accounting consistent we do the same here as well:
6404 * stats_size accumulates the sum of b_asize of the written buffers,
6405 * while write_asize accumulates the sum of b_asize rounded up
6406 * to the device block size.
 6407 * The latter sum is used only to validate the correctness of the code.
6408 */
6409 stats_size = 0;
6410 write_asize = 0;
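	/*
	 * Worked example (sizes assumed): a buffer whose (possibly
	 * compressed) b_asize is 2560 bytes on a 4 KB-ashift device adds
	 * 2560 to stats_size (and hence to arcstat_l2_asize and the vdev
	 * space accounting), but 4096 to write_asize, which is only compared
	 * against target_sz in the ASSERT near the end of this function.
	 */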
6411
3a17a7a9
SK
6412 /*
6413 * Now start writing the buffers. We're starting at the write head
6414 * and work backwards, retracing the course of the buffer selector
6415 * loop above.
6416 */
b9541d6b
CW
6417 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6418 hdr = list_prev(&dev->l2ad_buflist, hdr)) {
3a17a7a9
SK
6419 uint64_t buf_sz;
6420
ca0bf58d
PS
6421 /*
6422 * We rely on the L1 portion of the header below, so
6423 * it's invalid for this header to have been evicted out
6424 * of the ghost cache, prior to being written out. The
6425 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6426 */
6427 ASSERT(HDR_HAS_L1HDR(hdr));
6428
3a17a7a9
SK
6429 /*
6430 * We shouldn't need to lock the buffer here, since we flagged
2a432414
GW
6431 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6432 * take care to only access its L2 cache parameters. In
b9541d6b 6433 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
2a432414 6434 * ARC eviction.
3a17a7a9 6435 */
b9541d6b 6436 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
3a17a7a9 6437
b9541d6b
CW
6438 if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
6439 hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
6440 if (l2arc_compress_buf(hdr)) {
3a17a7a9
SK
6441 /*
6442 * If compression succeeded, enable headroom
6443 * boost on the next scan cycle.
6444 */
6445 *headroom_boost = B_TRUE;
6446 }
6447 }
6448
6449 /*
6450 * Pick up the buffer data we had previously stashed away
6451 * (and now potentially also compressed).
6452 */
b9541d6b
CW
6453 buf_data = hdr->b_l1hdr.b_tmp_cdata;
6454 buf_sz = hdr->b_l2hdr.b_asize;
3a17a7a9 6455
d962d5da
PS
6456 /*
 6457 * We need to do this regardless of whether buf_sz is zero or
 6458 * not; otherwise, when this l2hdr is evicted we'll remove a
 6459 * reference that was never added.
6460 */
6461 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6462
3a17a7a9
SK
6463 /* Compression may have squashed the buffer to zero length. */
6464 if (buf_sz != 0) {
ef56b078 6465 uint64_t buf_a_sz;
3a17a7a9 6466
34dc7c2f
BB
6467 wzio = zio_write_phys(pio, dev->l2ad_vdev,
6468 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6469 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6470 ZIO_FLAG_CANFAIL, B_FALSE);
6471
6472 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6473 zio_t *, wzio);
6474 (void) zio_nowait(wzio);
6475
ef56b078 6476 stats_size += buf_sz;
d962d5da 6477
b128c09f
BB
6478 /*
6479 * Keep the clock hand suitably device-aligned.
6480 */
ef56b078
AG
6481 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6482 write_asize += buf_a_sz;
6483 dev->l2ad_hand += buf_a_sz;
34dc7c2f 6484 }
34dc7c2f 6485 }
34dc7c2f 6486
b9541d6b 6487 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 6488
3a17a7a9 6489 ASSERT3U(write_asize, <=, target_sz);
34dc7c2f 6490 ARCSTAT_BUMP(arcstat_l2_writes_sent);
3a17a7a9 6491 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
34dc7c2f 6492 ARCSTAT_INCR(arcstat_l2_size, write_sz);
ef56b078
AG
6493 ARCSTAT_INCR(arcstat_l2_asize, stats_size);
6494 vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
34dc7c2f
BB
6495
6496 /*
6497 * Bump device hand to the device start if it is approaching the end.
6498 * l2arc_evict() will already have evicted ahead for this case.
6499 */
b128c09f 6500 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
34dc7c2f 6501 dev->l2ad_hand = dev->l2ad_start;
34dc7c2f
BB
6502 dev->l2ad_first = B_FALSE;
6503 }
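	/*
	 * Illustrative numbers (assumed): with l2ad_start = 4 MB, l2ad_end =
	 * 10 GB and target_sz = 8 MB, a hand at offset 9.995 GB is within
	 * target_sz of the end, so it wraps back to 4 MB and the device is
	 * reused as a circular log, overwriting the region l2arc_evict()
	 * has already cleared.
	 */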
6504
d164b209 6505 dev->l2ad_writing = B_TRUE;
34dc7c2f 6506 (void) zio_wait(pio);
d164b209
BB
6507 dev->l2ad_writing = B_FALSE;
6508
3a17a7a9
SK
6509 return (write_asize);
6510}
6511
6512/*
6513 * Compresses an L2ARC buffer.
b9541d6b 6514 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
3a17a7a9
SK
6515 * size in l2hdr->b_asize. This routine tries to compress the data and
6516 * depending on the compression result there are three possible outcomes:
6517 * *) The buffer was incompressible. The original l2hdr contents were left
6518 * untouched and are ready for writing to an L2 device.
6519 * *) The buffer was all-zeros, so there is no need to write it to an L2
6520 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
6521 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
6522 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6523 * data buffer which holds the compressed data to be written, and b_asize
6524 * tells us how much data there is. b_compress is set to the appropriate
6525 * compression algorithm. Once writing is done, invoke
6526 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6527 *
6528 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6529 * buffer was incompressible).
6530 */
6531static boolean_t
b9541d6b 6532l2arc_compress_buf(arc_buf_hdr_t *hdr)
3a17a7a9
SK
6533{
6534 void *cdata;
9b67f605 6535 size_t csize, len, rounded;
b9541d6b 6536 l2arc_buf_hdr_t *l2hdr;
3a17a7a9 6537
b9541d6b
CW
6538 ASSERT(HDR_HAS_L2HDR(hdr));
6539
6540 l2hdr = &hdr->b_l2hdr;
6541
6542 ASSERT(HDR_HAS_L1HDR(hdr));
6543 ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
6544 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
3a17a7a9
SK
6545
6546 len = l2hdr->b_asize;
6547 cdata = zio_data_buf_alloc(len);
b9541d6b
CW
6548 ASSERT3P(cdata, !=, NULL);
6549 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
3a17a7a9
SK
6550 cdata, l2hdr->b_asize);
6551
9b67f605
MA
6552 rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6553 if (rounded > csize) {
6554 bzero((char *)cdata + csize, rounded - csize);
6555 csize = rounded;
6556 }
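	/*
	 * Worked example: with SPA_MINBLOCKSIZE == 512, a csize of 700 bytes
	 * is rounded up to P2ROUNDUP(700, 512) == 1024, and bytes 700..1023
	 * of cdata are zero-filled so the padded tail written to the device
	 * is deterministic.
	 */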
6557
3a17a7a9
SK
6558 if (csize == 0) {
6559 /* zero block, indicate that there's nothing to write */
6560 zio_data_buf_free(cdata, len);
b9541d6b 6561 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
3a17a7a9 6562 l2hdr->b_asize = 0;
b9541d6b 6563 hdr->b_l1hdr.b_tmp_cdata = NULL;
3a17a7a9
SK
6564 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6565 return (B_TRUE);
6566 } else if (csize > 0 && csize < len) {
6567 /*
6568 * Compression succeeded, we'll keep the cdata around for
6569 * writing and release it afterwards.
6570 */
b9541d6b 6571 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
3a17a7a9 6572 l2hdr->b_asize = csize;
b9541d6b 6573 hdr->b_l1hdr.b_tmp_cdata = cdata;
3a17a7a9
SK
6574 ARCSTAT_BUMP(arcstat_l2_compress_successes);
6575 return (B_TRUE);
6576 } else {
6577 /*
6578 * Compression failed, release the compressed buffer.
6579 * l2hdr will be left unmodified.
6580 */
6581 zio_data_buf_free(cdata, len);
6582 ARCSTAT_BUMP(arcstat_l2_compress_failures);
6583 return (B_FALSE);
6584 }
6585}
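
/*
 * A sketch of the calling pattern this function expects, matching the three
 * outcomes documented above.  This is a simplified restatement using the
 * identifiers found elsewhere in this file, not the exact caller:
 *
 *	HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
 *	if (l2arc_compress_buf(hdr)) {
 *		if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
 *			(nothing to write: b_asize == 0)
 *		} else {
 *			(write b_l1hdr.b_tmp_cdata, b_l2hdr.b_asize bytes)
 *		}
 *	} else {
 *		(incompressible: write the original uncompressed buffer)
 *	}
 *	... after the write completes ...
 *	l2arc_release_cdata_buf(hdr);
 */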
6586
6587/*
6588 * Decompresses a zio read back from an l2arc device. On success, the
6589 * underlying zio's io_data buffer is overwritten by the uncompressed
6590 * version. On decompression error (corrupt compressed stream), the
6591 * zio->io_error value is set to signal an I/O error.
6592 *
6593 * Please note that the compressed data stream is not checksummed, so
6594 * if the underlying device is experiencing data corruption, we may feed
 6595 * corrupt data to the decompressor; the decompressor therefore needs to
 6596 * be able to handle this situation (LZ4 does).
6597 */
6598static void
6599l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6600{
6601 uint64_t csize;
6602 void *cdata;
6603
6604 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6605
6606 if (zio->io_error != 0) {
6607 /*
 6608 * An I/O error has occurred; just restore the original I/O
6609 * size in preparation for a main pool read.
6610 */
6611 zio->io_orig_size = zio->io_size = hdr->b_size;
6612 return;
6613 }
6614
6615 if (c == ZIO_COMPRESS_EMPTY) {
6616 /*
6617 * An empty buffer results in a null zio, which means we
6618 * need to fill its io_data after we're done restoring the
6619 * buffer's contents.
6620 */
b9541d6b
CW
6621 ASSERT(hdr->b_l1hdr.b_buf != NULL);
6622 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6623 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
3a17a7a9
SK
6624 } else {
6625 ASSERT(zio->io_data != NULL);
6626 /*
6627 * We copy the compressed data from the start of the arc buffer
6628 * (the zio_read will have pulled in only what we need, the
6629 * rest is garbage which we will overwrite at decompression)
6630 * and then decompress back to the ARC data buffer. This way we
6631 * can minimize copying by simply decompressing back over the
6632 * original compressed data (rather than decompressing to an
6633 * aux buffer and then copying back the uncompressed buffer,
6634 * which is likely to be much larger).
6635 */
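		/*
		 * Illustration (sizes assumed): for a 16 KB logical buffer
		 * stored compressed as 4 KB,
		 *
		 *	io_data before: [4 KB compressed | 12 KB garbage]
		 *	cdata (copy):   [4 KB compressed]
		 *
		 * zio_decompress_data() then expands cdata into the full
		 * 16 KB io_data buffer, after which cdata is freed.
		 */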
6636 csize = zio->io_size;
6637 cdata = zio_data_buf_alloc(csize);
6638 bcopy(zio->io_data, cdata, csize);
6639 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6640 hdr->b_size) != 0)
2e528b49 6641 zio->io_error = SET_ERROR(EIO);
3a17a7a9
SK
6642 zio_data_buf_free(cdata, csize);
6643 }
6644
6645 /* Restore the expected uncompressed IO size. */
6646 zio->io_orig_size = zio->io_size = hdr->b_size;
6647}
6648
6649/*
6650 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6651 * This buffer serves as a temporary holder of compressed data while
6652 * the buffer entry is being written to an l2arc device. Once that is
6653 * done, we can dispose of it.
6654 */
6655static void
2a432414 6656l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
3a17a7a9 6657{
ca0bf58d
PS
6658 enum zio_compress comp = HDR_GET_COMPRESS(hdr);
6659
b9541d6b 6660 ASSERT(HDR_HAS_L1HDR(hdr));
ca0bf58d
PS
6661 ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6662
6663 if (comp == ZIO_COMPRESS_OFF) {
6664 /*
6665 * In this case, b_tmp_cdata points to the same buffer
6666 * as the arc_buf_t's b_data field. We don't want to
6667 * free it, since the arc_buf_t will handle that.
6668 */
6669 hdr->b_l1hdr.b_tmp_cdata = NULL;
6670 } else if (comp == ZIO_COMPRESS_EMPTY) {
6671 /*
6672 * In this case, b_tmp_cdata was compressed to an empty
6673 * buffer, thus there's nothing to free and b_tmp_cdata
6674 * should have been set to NULL in l2arc_write_buffers().
6675 */
6676 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6677 } else {
3a17a7a9
SK
6678 /*
6679 * If the data was compressed, then we've allocated a
6680 * temporary buffer for it, so now we need to release it.
6681 */
b9541d6b
CW
6682 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6683 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6684 hdr->b_size);
ca0bf58d 6685 hdr->b_l1hdr.b_tmp_cdata = NULL;
3a17a7a9 6686 }
ca0bf58d 6687
34dc7c2f
BB
6688}
6689
6690/*
6691 * This thread feeds the L2ARC at regular intervals. This is the beating
6692 * heart of the L2ARC.
6693 */
6694static void
6695l2arc_feed_thread(void)
6696{
6697 callb_cpr_t cpr;
6698 l2arc_dev_t *dev;
6699 spa_t *spa;
d164b209 6700 uint64_t size, wrote;
428870ff 6701 clock_t begin, next = ddi_get_lbolt();
3a17a7a9 6702 boolean_t headroom_boost = B_FALSE;
40d06e3c 6703 fstrans_cookie_t cookie;
34dc7c2f
BB
6704
6705 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6706
6707 mutex_enter(&l2arc_feed_thr_lock);
6708
40d06e3c 6709 cookie = spl_fstrans_mark();
34dc7c2f 6710 while (l2arc_thread_exit == 0) {
34dc7c2f 6711 CALLB_CPR_SAFE_BEGIN(&cpr);
b64ccd6c 6712 (void) cv_timedwait_sig(&l2arc_feed_thr_cv,
5b63b3eb 6713 &l2arc_feed_thr_lock, next);
34dc7c2f 6714 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
428870ff 6715 next = ddi_get_lbolt() + hz;
34dc7c2f
BB
6716
6717 /*
b128c09f 6718 * Quick check for L2ARC devices.
34dc7c2f
BB
6719 */
6720 mutex_enter(&l2arc_dev_mtx);
6721 if (l2arc_ndev == 0) {
6722 mutex_exit(&l2arc_dev_mtx);
6723 continue;
6724 }
b128c09f 6725 mutex_exit(&l2arc_dev_mtx);
428870ff 6726 begin = ddi_get_lbolt();
34dc7c2f
BB
6727
6728 /*
b128c09f
BB
6729 * This selects the next l2arc device to write to, and in
6730 * doing so the next spa to feed from: dev->l2ad_spa. This
6731 * will return NULL if there are now no l2arc devices or if
6732 * they are all faulted.
6733 *
6734 * If a device is returned, its spa's config lock is also
6735 * held to prevent device removal. l2arc_dev_get_next()
6736 * will grab and release l2arc_dev_mtx.
34dc7c2f 6737 */
b128c09f 6738 if ((dev = l2arc_dev_get_next()) == NULL)
34dc7c2f 6739 continue;
b128c09f
BB
6740
6741 spa = dev->l2ad_spa;
6742 ASSERT(spa != NULL);
34dc7c2f 6743
572e2857
BB
6744 /*
6745 * If the pool is read-only then force the feed thread to
6746 * sleep a little longer.
6747 */
6748 if (!spa_writeable(spa)) {
6749 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6750 spa_config_exit(spa, SCL_L2ARC, dev);
6751 continue;
6752 }
6753
34dc7c2f 6754 /*
b128c09f 6755 * Avoid contributing to memory pressure.
34dc7c2f 6756 */
ca67b33a 6757 if (arc_reclaim_needed()) {
b128c09f
BB
6758 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6759 spa_config_exit(spa, SCL_L2ARC, dev);
34dc7c2f
BB
6760 continue;
6761 }
b128c09f 6762
34dc7c2f
BB
6763 ARCSTAT_BUMP(arcstat_l2_feeds);
6764
3a17a7a9 6765 size = l2arc_write_size();
b128c09f 6766
34dc7c2f
BB
6767 /*
6768 * Evict L2ARC buffers that will be overwritten.
6769 */
b128c09f 6770 l2arc_evict(dev, size, B_FALSE);
34dc7c2f
BB
6771
6772 /*
6773 * Write ARC buffers.
6774 */
3a17a7a9 6775 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
d164b209
BB
6776
6777 /*
6778 * Calculate interval between writes.
6779 */
6780 next = l2arc_write_interval(begin, size, wrote);
b128c09f 6781 spa_config_exit(spa, SCL_L2ARC, dev);
34dc7c2f 6782 }
40d06e3c 6783 spl_fstrans_unmark(cookie);
34dc7c2f
BB
6784
6785 l2arc_thread_exit = 0;
6786 cv_broadcast(&l2arc_feed_thr_cv);
6787 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
6788 thread_exit();
6789}
6790
b128c09f
BB
6791boolean_t
6792l2arc_vdev_present(vdev_t *vd)
6793{
6794 l2arc_dev_t *dev;
6795
6796 mutex_enter(&l2arc_dev_mtx);
6797 for (dev = list_head(l2arc_dev_list); dev != NULL;
6798 dev = list_next(l2arc_dev_list, dev)) {
6799 if (dev->l2ad_vdev == vd)
6800 break;
6801 }
6802 mutex_exit(&l2arc_dev_mtx);
6803
6804 return (dev != NULL);
6805}
6806
34dc7c2f
BB
6807/*
6808 * Add a vdev for use by the L2ARC. By this point the spa has already
6809 * validated the vdev and opened it.
6810 */
6811void
9babb374 6812l2arc_add_vdev(spa_t *spa, vdev_t *vd)
34dc7c2f
BB
6813{
6814 l2arc_dev_t *adddev;
6815
b128c09f
BB
6816 ASSERT(!l2arc_vdev_present(vd));
6817
34dc7c2f
BB
6818 /*
6819 * Create a new l2arc device entry.
6820 */
6821 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
6822 adddev->l2ad_spa = spa;
6823 adddev->l2ad_vdev = vd;
9babb374
BB
6824 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
6825 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
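	/*
	 * Illustrative layout (constants assumed): with the usual 4 MB
	 * VDEV_LABEL_START_SIZE, the writable L2ARC region is
	 * [4 MB, 4 MB + vdev_get_min_asize(vd)), so the vdev labels at the
	 * front of the device are never overwritten by L2ARC writes.
	 */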
34dc7c2f 6826 adddev->l2ad_hand = adddev->l2ad_start;
34dc7c2f 6827 adddev->l2ad_first = B_TRUE;
d164b209 6828 adddev->l2ad_writing = B_FALSE;
98f72a53 6829 list_link_init(&adddev->l2ad_node);
34dc7c2f 6830
b9541d6b 6831 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
6832 /*
6833 * This is a list of all ARC buffers that are still valid on the
6834 * device.
6835 */
b9541d6b
CW
6836 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
6837 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
34dc7c2f 6838
428870ff 6839 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
d962d5da 6840 refcount_create(&adddev->l2ad_alloc);
34dc7c2f
BB
6841
6842 /*
6843 * Add device to global list
6844 */
6845 mutex_enter(&l2arc_dev_mtx);
6846 list_insert_head(l2arc_dev_list, adddev);
6847 atomic_inc_64(&l2arc_ndev);
6848 mutex_exit(&l2arc_dev_mtx);
6849}
6850
6851/*
6852 * Remove a vdev from the L2ARC.
6853 */
6854void
6855l2arc_remove_vdev(vdev_t *vd)
6856{
6857 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6858
34dc7c2f
BB
6859 /*
6860 * Find the device by vdev
6861 */
6862 mutex_enter(&l2arc_dev_mtx);
6863 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6864 nextdev = list_next(l2arc_dev_list, dev);
6865 if (vd == dev->l2ad_vdev) {
6866 remdev = dev;
6867 break;
6868 }
6869 }
6870 ASSERT(remdev != NULL);
6871
6872 /*
6873 * Remove device from global list
6874 */
6875 list_remove(l2arc_dev_list, remdev);
6876 l2arc_dev_last = NULL; /* may have been invalidated */
b128c09f
BB
6877 atomic_dec_64(&l2arc_ndev);
6878 mutex_exit(&l2arc_dev_mtx);
34dc7c2f
BB
6879
6880 /*
6881 * Clear all buflists and ARC references. L2ARC device flush.
6882 */
6883 l2arc_evict(remdev, 0, B_TRUE);
b9541d6b
CW
6884 list_destroy(&remdev->l2ad_buflist);
6885 mutex_destroy(&remdev->l2ad_mtx);
d962d5da 6886 refcount_destroy(&remdev->l2ad_alloc);
34dc7c2f 6887 kmem_free(remdev, sizeof (l2arc_dev_t));
34dc7c2f
BB
6888}
6889
6890void
b128c09f 6891l2arc_init(void)
34dc7c2f
BB
6892{
6893 l2arc_thread_exit = 0;
6894 l2arc_ndev = 0;
6895 l2arc_writes_sent = 0;
6896 l2arc_writes_done = 0;
6897
6898 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6899 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6900 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
6901 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6902
6903 l2arc_dev_list = &L2ARC_dev_list;
6904 l2arc_free_on_write = &L2ARC_free_on_write;
6905 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6906 offsetof(l2arc_dev_t, l2ad_node));
6907 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6908 offsetof(l2arc_data_free_t, l2df_list_node));
34dc7c2f
BB
6909}
6910
6911void
b128c09f 6912l2arc_fini(void)
34dc7c2f 6913{
b128c09f
BB
6914 /*
 6915 * This is called from dmu_fini(), which is called from spa_fini().
6916 * Because of this, we can assume that all l2arc devices have
6917 * already been removed when the pools themselves were removed.
6918 */
6919
6920 l2arc_do_free_on_write();
34dc7c2f
BB
6921
6922 mutex_destroy(&l2arc_feed_thr_lock);
6923 cv_destroy(&l2arc_feed_thr_cv);
6924 mutex_destroy(&l2arc_dev_mtx);
34dc7c2f
BB
6925 mutex_destroy(&l2arc_free_on_write_mtx);
6926
6927 list_destroy(l2arc_dev_list);
6928 list_destroy(l2arc_free_on_write);
6929}
b128c09f
BB
6930
6931void
6932l2arc_start(void)
6933{
fb5f0bc8 6934 if (!(spa_mode_global & FWRITE))
b128c09f
BB
6935 return;
6936
6937 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6938 TS_RUN, minclsyspri);
6939}
6940
6941void
6942l2arc_stop(void)
6943{
fb5f0bc8 6944 if (!(spa_mode_global & FWRITE))
b128c09f
BB
6945 return;
6946
6947 mutex_enter(&l2arc_feed_thr_lock);
6948 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
6949 l2arc_thread_exit = 1;
6950 while (l2arc_thread_exit != 0)
6951 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6952 mutex_exit(&l2arc_feed_thr_lock);
6953}
c28b2279
BB
6954
6955#if defined(_KERNEL) && defined(HAVE_SPL)
0f699108
AZ
6956EXPORT_SYMBOL(arc_buf_size);
6957EXPORT_SYMBOL(arc_write);
c28b2279
BB
6958EXPORT_SYMBOL(arc_read);
6959EXPORT_SYMBOL(arc_buf_remove_ref);
e0b0ca98 6960EXPORT_SYMBOL(arc_buf_info);
c28b2279 6961EXPORT_SYMBOL(arc_getbuf_func);
ab26409d
BB
6962EXPORT_SYMBOL(arc_add_prune_callback);
6963EXPORT_SYMBOL(arc_remove_prune_callback);
c28b2279 6964
bce45ec9 6965module_param(zfs_arc_min, ulong, 0644);
c409e464 6966MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
c28b2279 6967
bce45ec9 6968module_param(zfs_arc_max, ulong, 0644);
c409e464 6969MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
c28b2279 6970
bce45ec9 6971module_param(zfs_arc_meta_limit, ulong, 0644);
c28b2279 6972MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
6a8f9b6b 6973
ca0bf58d
PS
6974module_param(zfs_arc_meta_min, ulong, 0644);
6975MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");
6976
bce45ec9 6977module_param(zfs_arc_meta_prune, int, 0644);
2cbb06b5 6978MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
c409e464 6979
ca67b33a 6980module_param(zfs_arc_meta_adjust_restarts, int, 0644);
bc888666
BB
6981MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
6982 "Limit number of restarts in arc_adjust_meta");
6983
f6046738
BB
6984module_param(zfs_arc_meta_strategy, int, 0644);
6985MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");
6986
bce45ec9 6987module_param(zfs_arc_grow_retry, int, 0644);
c409e464
BB
6988MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
6989
89c8cac4
PS
6990module_param(zfs_arc_p_aggressive_disable, int, 0644);
6991MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");
6992
62422785
PS
6993module_param(zfs_arc_p_dampener_disable, int, 0644);
6994MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
6995
bce45ec9 6996module_param(zfs_arc_shrink_shift, int, 0644);
c409e464
BB
6997MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
6998
1f7c30df
BB
6999module_param(zfs_disable_dup_eviction, int, 0644);
7000MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
7001
49ddb315
MA
7002module_param(zfs_arc_average_blocksize, int, 0444);
7003MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
7004
0c5493d4
BB
7005module_param(zfs_arc_memory_throttle_disable, int, 0644);
7006MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
7007
bce45ec9
BB
7008module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
7009MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
7010
ca0bf58d
PS
7011module_param(zfs_arc_num_sublists_per_state, int, 0644);
7012MODULE_PARM_DESC(zfs_arc_num_sublists_per_state,
7013 "Number of sublists used in each of the ARC state lists");
7014
bce45ec9 7015module_param(l2arc_write_max, ulong, 0644);
abd8610c
BB
7016MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
7017
bce45ec9 7018module_param(l2arc_write_boost, ulong, 0644);
abd8610c
BB
7019MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
7020
bce45ec9 7021module_param(l2arc_headroom, ulong, 0644);
abd8610c
BB
7022MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
7023
3a17a7a9
SK
7024module_param(l2arc_headroom_boost, ulong, 0644);
7025MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
7026
bce45ec9 7027module_param(l2arc_feed_secs, ulong, 0644);
abd8610c
BB
7028MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
7029
bce45ec9 7030module_param(l2arc_feed_min_ms, ulong, 0644);
abd8610c
BB
7031MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
7032
bce45ec9 7033module_param(l2arc_noprefetch, int, 0644);
abd8610c
BB
7034MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
7035
3a17a7a9
SK
7036module_param(l2arc_nocompress, int, 0644);
7037MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
7038
bce45ec9 7039module_param(l2arc_feed_again, int, 0644);
abd8610c
BB
7040MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
7041
bce45ec9 7042module_param(l2arc_norw, int, 0644);
abd8610c
BB
7043MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
7044
c28b2279 7045#endif