/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal ARC algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * ARC list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each ARC state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an ARC list lock you
 * must use mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * It is also possible to register a callback which is run when the
 * arc_meta_limit is reached and no buffers can be safely evicted. In
 * this case the arc user should drop a reference on some arc buffers so
 * they can be reclaimed and the arc_meta_limit honored. For example,
 * when using the ZPL each dentry holds a reference on a znode. These
 * dentries must be pruned before the arc buffer holding the znode can
 * be safely evicted.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

/*
 * ARC operation:
 *
 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 * This structure can point either to a block that is still in the cache or to
 * one that is only accessible in an L2 ARC device, or it can provide
 * information about a block that was recently evicted. If a block is
 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 * information to retrieve it from the L2ARC device. This information is
 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
 * that is in this state cannot access the data directly.
 *
 * Blocks that are actively being referenced or have not been evicted
 * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
 * the arc_buf_hdr_t that will point to the data block in memory. A block can
 * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 *
 * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
 * ability to store the physical data (b_pabd) associated with the DVA of the
 * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
 * it will match its on-disk compression characteristics. This behavior can be
 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
 * compressed ARC functionality is disabled, the b_pabd will point to an
 * uncompressed version of the on-disk data.
 *
 * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 * consumer. The ARC will provide references to this data and will keep it
 * cached until it is no longer in use. The ARC caches only the L1ARC's physical
 * data block and will evict any arc_buf_t that is no longer referenced. The
 * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
 * "overhead_size" kstat.
 *
 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 * compressed form. The typical case is that consumers will want uncompressed
 * data, and when that happens a new data buffer is allocated where the data is
 * decompressed for them to use. Currently the only consumer who wants
 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 * exists on disk. When this happens, the arc_buf_t's data buffer is shared
 * with the arc_buf_hdr_t.
 *
 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
 * first one is owned by a compressed send consumer (and therefore references
 * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
 * used by any other consumer (and has its own uncompressed copy of the data
 * buffer).
 *
 * arc_buf_hdr_t
 * +-----------+
 * | fields    |
 * | common to |
 * | L1- and   |
 * | L2ARC     |
 * +-----------+
 * | l2arc_buf_hdr_t
 * |           |
 * +-----------+
 * | l1arc_buf_hdr_t
 * |           |                 arc_buf_t
 * | b_buf     +------------>+-----------+      arc_buf_t
 * | b_pabd    +-+           |b_next     +---->+-----------+
 * +-----------+ |           |-----------|     |b_next     +-->NULL
 *               |           |b_comp = T |     +-----------+
 *               |           |b_data     +-+   |b_comp = F |
 *               |           +-----------+ |   |b_data     +-+
 *               +->+------+               |   +-----------+ |
 *      compressed  |      |               |                 |
 *         data     |      |<--------------+                 | uncompressed
 *                  +------+   compressed,                    |     data
 *                               shared                       +-->+------+
 *                                data                            |      |
 *                                                                |      |
 *                                                                +------+
 *
 * When a consumer reads a block, the ARC must first look to see if the
 * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
 * arc_buf_t and either copies uncompressed data into a new data buffer from an
 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 * hdr is compressed and the desired compression characteristics of the
 * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 * be anywhere in the hdr's list.
 *
 * The diagram below shows an example of an uncompressed ARC hdr that is
 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 * the last element in the buf list):
 *
 * arc_buf_hdr_t
 * +-----------+
 * |           |
 * |           |
 * |           |
 * +-----------+
 * l2arc_buf_hdr_t|           |
 * |           |
 * +-----------+
 * l1arc_buf_hdr_t|           |
 * |           |                arc_buf_t (shared)
 * | b_buf     +------------>+---------+      arc_buf_t
 * |           |             |b_next   +---->+---------+
 * | b_pabd    +-+           |---------|     |b_next   +-->NULL
 * +-----------+ |           |         |     +---------+
 *               |           |b_data   +-+   |         |
 *               |           +---------+ |   |b_data   +-+
 *               +->+------+             |   +---------+ |
 *                  |      |             |               |
 *  uncompressed    |      |             |               |
 *      data        +------+             |               |
 *                     ^                 +->+------+     |
 *                     |    uncompressed    |      |     |
 *                     |        data        |      |     |
 *                     |                    +------+     |
 *                     +----------------------------------+
 *
 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 * since the physical block is about to be rewritten. The new data contents
 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
 * it may compress the data before writing it to disk. The ARC will be called
 * with the transformed data and will bcopy the transformed on-disk block into
 * a newly allocated b_pabd. Writes are always done into buffers which have
 * either been loaned (and hence are new and don't have other readers) or
 * buffers which have been released (and hence have their own hdr, if there
 * were originally other readers of the buf's original hdr). This ensures that
 * the ARC only needs to update a single buf and its hdr after a write occurs.
 *
 * When the L2ARC is in use, it will also take advantage of the b_pabd. The
 * L2ARC will always write the contents of b_pabd to the L2ARC. This means
 * that when compressed ARC is enabled, the L2ARC blocks are identical
 * to the on-disk block in the main data pool. This provides a significant
 * advantage since the ARC can leverage the bp's checksum when reading from the
 * L2ARC to determine if the contents are valid. However, if the compressed
 * ARC is disabled, then the L2ARC's block must be transformed to look
 * like the physical block in the main data pool before comparing the
 * checksum and determining its validity.
 *
 * The L1ARC has a slightly different system for storing encrypted data.
 * Raw (encrypted + possibly compressed) data has a few subtle differences from
 * data that is just compressed. The biggest difference is that it is not
 * possible to decrypt encrypted data (or vice versa) if the keys aren't loaded.
 * The other difference is that encryption cannot be treated as a suggestion.
 * If a caller would prefer compressed data, but they actually wind up with
 * uncompressed data the worst thing that could happen is there might be a
 * performance hit. If the caller requests encrypted data, however, we must be
 * sure they actually get it or else secret information could be leaked. Raw
 * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
 * may have both an encrypted version and a decrypted version of its data at
 * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
 * copied out of this header. To avoid complications with b_pabd, raw buffers
 * cannot be shared.
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/multilist.h>
#include <sys/abd.h>
#include <sys/zil.h>
#include <sys/fm/fs/zfs.h>
#ifdef _KERNEL
#include <sys/shrinker.h>
#include <sys/vmsystm.h>
#include <sys/zpl.h>
#include <linux/page_compat.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/dmu_tx.h>
#include <zfs_fletcher.h>
#include <sys/arc_impl.h>
#include <sys/trace_arc.h>
#include <sys/aggsum.h>
#include <sys/cityhash.h>

#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
#endif

static kmutex_t arc_reclaim_lock;
static kcondvar_t arc_reclaim_thread_cv;
static boolean_t arc_reclaim_thread_exit;
static kcondvar_t arc_reclaim_waiters_cv;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/* number of seconds before growing cache again */
static int arc_grow_retry = 5;

/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 7;

/* percent of pagecache to reclaim arc to */
#ifdef _KERNEL
static uint_t zfs_arc_pc_percent = 0;
#endif

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int arc_no_grow_shift = 5;


/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int arc_min_prefetch_ms;
static int arc_min_prescient_prefetch_ms;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * log2 fraction of the zio arena to keep free.
 */
int arc_zio_arena_free_shift = 2;

/*
 * These tunables are for performance analysis.
 */
unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0;
unsigned long zfs_arc_meta_min = 0;
unsigned long zfs_arc_dnode_limit = 0;
unsigned long zfs_arc_dnode_reduce_percent = 10;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */

/*
 * ARC dirty data constraints for arc_tempreserve_space() throttle.
 */
unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */

/*
 * Enable or disable compressed arc buffers.
 */
int zfs_compressed_arc_enabled = B_TRUE;

/*
 * ARC will evict meta buffers that exceed arc_meta_limit. This
 * tunable makes arc_meta_limit adjustable for different workloads.
 */
unsigned long zfs_arc_meta_limit_percent = 75;

/*
 * Percentage that can be consumed by dnodes of ARC meta buffers.
 */
unsigned long zfs_arc_dnode_limit_percent = 10;

/*
 * These tunables are Linux specific.
 */
unsigned long zfs_arc_sys_free = 0;
int zfs_arc_min_prefetch_ms = 0;
int zfs_arc_min_prescient_prefetch_ms = 0;
int zfs_arc_p_dampener_disable = 1;
int zfs_arc_meta_prune = 10000;
int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
int zfs_arc_meta_adjust_restarts = 4096;
int zfs_arc_lotsfree_percent = 10;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread. The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped when updating the access state due to the
	 * header having already been released after acquiring the hash lock.
	 */
	kstat_named_t arcstat_access_skip;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
	 */
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_size;
	/*
	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
	 * Note that the compressed bytes may match the uncompressed bytes
	 * if the block is either not compressed or compressed arc is disabled.
	 */
	kstat_named_t arcstat_compressed_size;
	/*
	 * Uncompressed size of the data stored in b_pabd. If compressed
	 * arc is disabled then this value will be identical to the stat
	 * above.
	 */
	kstat_named_t arcstat_uncompressed_size;
	/*
	 * Number of bytes stored in all the arc_buf_t's. This is classified
	 * as "overhead" since this data is typically short-lived and will
	 * be evicted from the arc when it becomes unreferenced unless the
	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
	 * values have been set (see comment in dbuf.c for more information).
	 */
	kstat_named_t arcstat_overhead_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA. This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by dmu_buf_impl_t objects.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_dbuf_size;
	/*
	 * Number of bytes consumed by dnode_t objects.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_dnode_size;
	/*
	 * Number of bytes consumed by bonus buffers.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_bonus_size;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state. This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state. This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state. The key thing to note
	 * here, is the fact that this size doesn't actually indicate
	 * RAM consumption. The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state. This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state. See the comment above
	 * arcstat_mru_ghost_size for more details.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_lsize;
	kstat_named_t arcstat_l2_psize;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_memory_direct_count;
	kstat_named_t arcstat_memory_indirect_count;
	kstat_named_t arcstat_memory_all_bytes;
	kstat_named_t arcstat_memory_free_bytes;
	kstat_named_t arcstat_memory_available_bytes;
	kstat_named_t arcstat_no_grow;
	kstat_named_t arcstat_tempreserve;
	kstat_named_t arcstat_loaned_bytes;
	kstat_named_t arcstat_prune;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_dnode_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_async_upgrade_sync;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
	kstat_named_t arcstat_demand_hit_prescient_prefetch;
	kstat_named_t arcstat_need_free;
	kstat_named_t arcstat_sys_free;
	kstat_named_t arcstat_raw_size;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "demand_data_hits", KSTAT_DATA_UINT64 },
	{ "demand_data_misses", KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
	{ "mru_hits", KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
	{ "mfu_hits", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
	{ "deleted", KSTAT_DATA_UINT64 },
	{ "mutex_miss", KSTAT_DATA_UINT64 },
	{ "access_skip", KSTAT_DATA_UINT64 },
	{ "evict_skip", KSTAT_DATA_UINT64 },
	{ "evict_not_enough", KSTAT_DATA_UINT64 },
	{ "evict_l2_cached", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_skip", KSTAT_DATA_UINT64 },
	{ "hash_elements", KSTAT_DATA_UINT64 },
	{ "hash_elements_max", KSTAT_DATA_UINT64 },
	{ "hash_collisions", KSTAT_DATA_UINT64 },
	{ "hash_chains", KSTAT_DATA_UINT64 },
	{ "hash_chain_max", KSTAT_DATA_UINT64 },
	{ "p", KSTAT_DATA_UINT64 },
	{ "c", KSTAT_DATA_UINT64 },
	{ "c_min", KSTAT_DATA_UINT64 },
	{ "c_max", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
	{ "compressed_size", KSTAT_DATA_UINT64 },
	{ "uncompressed_size", KSTAT_DATA_UINT64 },
	{ "overhead_size", KSTAT_DATA_UINT64 },
	{ "hdr_size", KSTAT_DATA_UINT64 },
	{ "data_size", KSTAT_DATA_UINT64 },
	{ "metadata_size", KSTAT_DATA_UINT64 },
	{ "dbuf_size", KSTAT_DATA_UINT64 },
	{ "dnode_size", KSTAT_DATA_UINT64 },
	{ "bonus_size", KSTAT_DATA_UINT64 },
	{ "anon_size", KSTAT_DATA_UINT64 },
	{ "anon_evictable_data", KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_size", KSTAT_DATA_UINT64 },
	{ "mru_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_ghost_size", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits", KSTAT_DATA_UINT64 },
	{ "l2_misses", KSTAT_DATA_UINT64 },
	{ "l2_feeds", KSTAT_DATA_UINT64 },
	{ "l2_rw_clash", KSTAT_DATA_UINT64 },
	{ "l2_read_bytes", KSTAT_DATA_UINT64 },
	{ "l2_write_bytes", KSTAT_DATA_UINT64 },
	{ "l2_writes_sent", KSTAT_DATA_UINT64 },
	{ "l2_writes_done", KSTAT_DATA_UINT64 },
	{ "l2_writes_error", KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_reading", KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
	{ "l2_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
	{ "l2_io_error", KSTAT_DATA_UINT64 },
	{ "l2_size", KSTAT_DATA_UINT64 },
	{ "l2_asize", KSTAT_DATA_UINT64 },
	{ "l2_hdr_size", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count", KSTAT_DATA_UINT64 },
	{ "memory_direct_count", KSTAT_DATA_UINT64 },
	{ "memory_indirect_count", KSTAT_DATA_UINT64 },
	{ "memory_all_bytes", KSTAT_DATA_UINT64 },
	{ "memory_free_bytes", KSTAT_DATA_UINT64 },
	{ "memory_available_bytes", KSTAT_DATA_INT64 },
	{ "arc_no_grow", KSTAT_DATA_UINT64 },
	{ "arc_tempreserve", KSTAT_DATA_UINT64 },
	{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
	{ "arc_prune", KSTAT_DATA_UINT64 },
	{ "arc_meta_used", KSTAT_DATA_UINT64 },
	{ "arc_meta_limit", KSTAT_DATA_UINT64 },
	{ "arc_dnode_limit", KSTAT_DATA_UINT64 },
	{ "arc_meta_max", KSTAT_DATA_UINT64 },
	{ "arc_meta_min", KSTAT_DATA_UINT64 },
	{ "async_upgrade_sync", KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
	{ "arc_need_free", KSTAT_DATA_UINT64 },
	{ "arc_sys_free", KSTAT_DATA_UINT64 },
	{ "arc_raw_size", KSTAT_DATA_UINT64 }
};

#define ARCSTAT(stat) (arc_stats.stat.value.ui64)

#define ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)

#define ARCSTAT_MAX(stat, val) { \
	uint64_t m; \
	while ((val) > (m = arc_stats.stat.value.ui64) && \
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
		continue; \
}

#define ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) { \
		if (cond2) { \
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else { \
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		} \
	} else { \
		if (cond2) { \
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else { \
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		} \
	}

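/*
 * Illustrative usage (editorial note, not in the original source): a demand
 * read that hits on a data buffer would typically be counted with something
 * like
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * which expands to bump exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits, or
 * arcstat_prefetch_metadata_hits, depending on the two conditions.
 */
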
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */
#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */

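/*
 * Illustrative note (editorial addition): with the aliases above, a line such
 * as
 *
 *	arc_c = MIN(arc_c_max, arc_c + bytes);
 *
 * reads and writes arc_stats.arcstat_c.value.ui64 directly, so the exported
 * kstat and the internal target size can never diverge.
 */
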
/* size of all b_rabd's in entire arc */
#define arc_raw_size ARCSTAT(arcstat_raw_size)
/* compressed size of entire arc */
#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
/* number of bytes in the arc from arc_buf_t's */
#define arc_overhead_size ARCSTAT(arcstat_overhead_size)

/*
 * There are also some ARC variables that we want to export, but that are
 * updated so often that having the canonical representation be the statistic
 * variable causes a performance bottleneck. We want to use aggsum_t's for these
 * instead, but still be able to export the kstat in the same way as before.
 * The solution is to always use the aggsum version, except in the kstat update
 * callback.
 */
aggsum_t arc_size;
aggsum_t arc_meta_used;
aggsum_t astat_data_size;
aggsum_t astat_metadata_size;
aggsum_t astat_dbuf_size;
aggsum_t astat_dnode_size;
aggsum_t astat_bonus_size;
aggsum_t astat_hdr_size;
aggsum_t astat_l2_hdr_size;

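/*
 * Sketch (editorial assumption; the kstat update callback itself is not in
 * this excerpt): the aggsums are folded back into arc_stats only in that
 * callback, e.g. with something like
 *
 *	ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
 *	ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
 *
 * Everywhere else the code updates the aggsum_t (aggsum_add()) rather than
 * the kstat, avoiding a contended global atomic on every buffer operation.
 */
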
static list_t arc_prune_list;
static kmutex_t arc_prune_mtx;
static taskq_t *arc_prune_taskq;

#define GHOST_STATE(state) \
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
	(state) == arc_l2c_only)

#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_PRESCIENT_PREFETCH(hdr) \
	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define HDR_COMPRESSION_ENABLED(hdr) \
	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define HDR_L2_READING(hdr) \
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
#define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED)
#define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH)
#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)

#define HDR_ISTYPE_METADATA(hdr) \
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))

#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
#define HDR_HAS_RABD(hdr) \
	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \
	(hdr)->b_crypt_hdr.b_rabd != NULL)
#define HDR_ENCRYPTED(hdr) \
	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
#define HDR_AUTHENTICATED(hdr) \
	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))

/* For storing compression mode in b_flags */
#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)

#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));

#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
#define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)

/*
 * Other sizes
 */

#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define HT_LOCK_ALIGN 64
#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))

struct ht_lock {
	kmutex_t ht_lock;
#ifdef _KERNEL
	unsigned char pad[HT_LOCK_PAD];
#endif
};

#define BUF_LOCKS 8192
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

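/*
 * Descriptive note (editorial addition): HDR_LOCK() hashes a header's
 * identity (spa, DVA, birth TXG) down to one of the BUF_LOCKS (8192) striped
 * mutexes, so unrelated headers that land in different stripes can be looked
 * up concurrently while headers that collide on a stripe share a lock.
 */
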
uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
#define L2ARC_HEADROOM 2 /* num of writes */

/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define L2ARC_HEADROOM_BOOST 200
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */

/*
 * We can feed L2ARC from two states of ARC buffers, mru and mfu,
 * and each of the states has two types: data and metadata.
 */
#define L2ARC_FEED_TYPES 4

#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
int l2arc_feed_again = B_TRUE; /* turbo warmup */
int l2arc_norw = B_FALSE; /* no reads during writes */

/*
 * L2ARC Internals
 */
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
static list_t L2ARC_free_on_write; /* free after write buf list */
static list_t *l2arc_free_on_write; /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
static uint64_t l2arc_ndev; /* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_hdr_t *l2rcb_hdr; /* read header */
	blkptr_t l2rcb_bp; /* original blkptr */
	zbookmark_phys_t l2rcb_zb; /* original bookmark */
	int l2rcb_flags; /* original flags */
	abd_t *l2rcb_abd; /* temporary buffer */
} l2arc_read_callback_t;

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	abd_t *l2df_abd;
	size_t l2df_size;
	arc_buf_contents_t l2df_type;
	list_node_t l2df_list_node;
} l2arc_data_free_t;

typedef enum arc_fill_flags {
	ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */
	ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */
	ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */
	ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */
	ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
} arc_fill_flags_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
static void arc_tuning_update(void);
static void arc_prune_async(int64_t);
static uint64_t arc_all_memory(void);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);

static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);


/*
 * We use Cityhash for this. It's fast, and has good hash properties without
 * requiring any large static buffers.
 */
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
}

#define HDR_EMPTY(hdr) \
	((hdr)->b_dva.dva_word[0] == 0 && \
	(hdr)->b_dva.dva_word[1] == 0)

#define HDR_EQUAL(spa, dva, birth, hdr) \
	((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
	((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)

static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
	hdr->b_dva.dva_word[0] = 0;
	hdr->b_dva.dva_word[1] = 0;
	hdr->b_birth = 0;
}

static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *hdr;

	mutex_enter(hash_lock);
	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
	    hdr = hdr->b_hash_next) {
		if (HDR_EQUAL(spa, dva, birth, hdr)) {
			*lockp = hash_lock;
			return (hdr);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table. If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 * If lockp == NULL, the caller is assumed to already hold the hash lock.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fhdr;
	uint32_t i;

	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
	ASSERT(hdr->b_birth != 0);
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (lockp != NULL) {
		*lockp = hash_lock;
		mutex_enter(hash_lock);
	} else {
		ASSERT(MUTEX_HELD(hash_lock));
	}

	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
	    fhdr = fhdr->b_hash_next, i++) {
		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
			return (fhdr);
	}

	hdr->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = hdr;
	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *hdr)
{
	arc_buf_hdr_t *fhdr, **hdrp;
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(hdr));

	hdrp = &buf_hash_table.ht_table[idx];
	while ((fhdr = *hdrp) != hdr) {
		ASSERT3P(fhdr, !=, NULL);
		hdrp = &fhdr->b_hash_next;
	}
	*hdrp = hdr->b_hash_next;
	hdr->b_hash_next = NULL;
	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */

static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_full_crypt_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

#if defined(_KERNEL)
	/*
	 * Large allocations which do not require contiguous pages
	 * should be using vmem_free() in the Linux kernel.
	 */
	vmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
#else
	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
#endif
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_full_cache);
	kmem_cache_destroy(hdr_full_crypt_cache);
	kmem_cache_destroy(hdr_l2only_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_FULL_SIZE);
	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
	refcount_create(&hdr->b_l1hdr.b_refcnt);
	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	list_link_init(&hdr->b_l1hdr.b_arc_node);
	list_link_init(&hdr->b_l2hdr.b_l2node);
	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	hdr_full_cons(vbuf, unused, kmflag);
	bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
	arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_L2ONLY_SIZE);
	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);

	return (0);
}

/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_full_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(HDR_EMPTY(hdr));
	cv_destroy(&hdr->b_l1hdr.b_cv);
	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
hdr_full_crypt_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	hdr_full_dest(vbuf, unused);
	arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
	ASSERTV(arc_buf_hdr_t *hdr = vbuf);

	ASSERT(HDR_EMPTY(hdr));
	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}

/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thread_cv);
}

static void
buf_init(void)
{
	uint64_t *ct = NULL;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
	 */
	while (hsize * zfs_arc_average_blocksize < arc_all_memory())
34dc7c2f
BB
1401 hsize <<= 1;
1402retry:
1403 buf_hash_table.ht_mask = hsize - 1;
93ce2b4c 1404#if defined(_KERNEL)
d1d7e268
MK
1405 /*
1406 * Large allocations which do not require contiguous pages
1407 * should be using vmem_alloc() in the linux kernel
1408 */
00b46022
BB
1409 buf_hash_table.ht_table =
1410 vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
1411#else
34dc7c2f
BB
1412 buf_hash_table.ht_table =
1413 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
00b46022 1414#endif
34dc7c2f
BB
1415 if (buf_hash_table.ht_table == NULL) {
1416 ASSERT(hsize > (1ULL << 8));
1417 hsize >>= 1;
1418 goto retry;
1419 }
1420
b9541d6b 1421 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
8c8af9d8 1422 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
b5256303
TC
1423 hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
1424 HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
1425 hdr_recl, NULL, NULL, 0);
b9541d6b 1426 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
8c8af9d8 1427 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
b9541d6b 1428 NULL, NULL, 0);
34dc7c2f 1429 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
b128c09f 1430 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
34dc7c2f
BB
1431
1432 for (i = 0; i < 256; i++)
1433 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1434 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1435
1436 for (i = 0; i < BUF_LOCKS; i++) {
1437 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
40d06e3c 1438 NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
1439 }
1440}
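/*
 * Illustrative sizing sketch (added for clarity, not part of the original
 * file): with the default zfs_arc_average_blocksize of 8K, the loop in
 * buf_init() doubles hsize until hsize * 8K covers physical memory.  On a
 * hypothetical machine with 64 GiB of RAM:
 *
 *	64 GiB / 8 KiB           = 2^23 hash buckets
 *	2^23 buckets * 8 bytes   = 64 MiB of bucket pointers
 *
 * which matches the "1MB per GB with 8-byte pointers" estimate in the
 * comment above.
 */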
1441
d3c2ae1c 1442#define ARC_MINTIME (hz>>4) /* 62 ms */
ca0bf58d 1443
2aa34383
DK
1444/*
1445 * This is the size that the buf occupies in memory. If the buf is compressed,
1446 * it will correspond to the compressed size. You should use this method of
1447 * getting the buf size unless you explicitly need the logical size.
1448 */
1449uint64_t
1450arc_buf_size(arc_buf_t *buf)
1451{
1452 return (ARC_BUF_COMPRESSED(buf) ?
1453 HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1454}
1455
1456uint64_t
1457arc_buf_lsize(arc_buf_t *buf)
1458{
1459 return (HDR_GET_LSIZE(buf->b_hdr));
1460}
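/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * debugging helper contrasting the two accessors above.  For a compressed
 * buf arc_buf_size() reports the physical (compressed) size, while
 * arc_buf_lsize() always reports the logical size; for an uncompressed buf
 * the two are equal.
 */
static void
example_print_buf_sizes(arc_buf_t *buf)
{
	uint64_t memsize = arc_buf_size(buf);	/* size occupied in memory */
	uint64_t lsize = arc_buf_lsize(buf);	/* logical data size */

	dprintf("buf %p: %llu bytes in memory, %llu bytes logical\n",
	    buf, (u_longlong_t)memsize, (u_longlong_t)lsize);
}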
1461
b5256303
TC
1462/*
1463 * This function will return B_TRUE if the buffer is encrypted in memory.
1464 * This buffer can be decrypted by calling arc_untransform().
1465 */
1466boolean_t
1467arc_is_encrypted(arc_buf_t *buf)
1468{
1469 return (ARC_BUF_ENCRYPTED(buf) != 0);
1470}
1471
1472/*
1473 * Returns B_TRUE if the buffer represents data that has not had its MAC
1474 * verified yet.
1475 */
1476boolean_t
1477arc_is_unauthenticated(arc_buf_t *buf)
1478{
1479 return (HDR_NOAUTH(buf->b_hdr) != 0);
1480}
1481
1482void
1483arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
1484 uint8_t *iv, uint8_t *mac)
1485{
1486 arc_buf_hdr_t *hdr = buf->b_hdr;
1487
1488 ASSERT(HDR_PROTECTED(hdr));
1489
1490 bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
1491 bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
1492 bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
1493 *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
1494 ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
1495}
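/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller copying out the raw crypto parameters of a protected buffer,
 * sized with the same ZIO_DATA_*_LEN constants used above.  The caller
 * must ensure the buffer's header is protected (encrypted or
 * authenticated), since arc_get_raw_params() asserts HDR_PROTECTED().
 */
static void
example_fetch_raw_params(arc_buf_t *buf)
{
	boolean_t byteorder;
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];

	arc_get_raw_params(buf, &byteorder, salt, iv, mac);

	/* salt/iv/mac can now be fed to a raw (encrypted) write. */
}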
1496
1497/*
1498 * Indicates how this buffer is compressed in memory. If it is not compressed
1499 * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
1500 * arc_untransform() as long as it is also unencrypted.
1501 */
2aa34383
DK
1502enum zio_compress
1503arc_get_compression(arc_buf_t *buf)
1504{
1505 return (ARC_BUF_COMPRESSED(buf) ?
1506 HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1507}
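/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * predicate combining arc_is_encrypted() and arc_get_compression() to
 * decide whether a buf's b_data is directly consumable or still needs to
 * be transformed (decrypted and/or decompressed) first, e.g. via
 * arc_untransform() further below.
 */
static boolean_t
example_buf_is_plaintext(arc_buf_t *buf)
{
	return (!arc_is_encrypted(buf) &&
	    arc_get_compression(buf) == ZIO_COMPRESS_OFF);
}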
1508
b5256303
TC
1509/*
1510 * Return the compression algorithm used to store this data in the ARC. If ARC
1511 * compression is enabled or this is an encrypted block, this will be the same
1512 * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
1513 */
1514static inline enum zio_compress
1515arc_hdr_get_compress(arc_buf_hdr_t *hdr)
1516{
1517 return (HDR_COMPRESSION_ENABLED(hdr) ?
1518 HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
1519}
1520
d3c2ae1c
GW
1521static inline boolean_t
1522arc_buf_is_shared(arc_buf_t *buf)
1523{
1524 boolean_t shared = (buf->b_data != NULL &&
a6255b7f
DQ
1525 buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1526 abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1527 buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
d3c2ae1c 1528 IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
2aa34383
DK
1529 IMPLY(shared, ARC_BUF_SHARED(buf));
1530 IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
524b4217
DK
1531
1532 /*
1533 * It would be nice to assert arc_can_share() too, but the "hdr isn't
1534 * already being shared" requirement prevents us from doing that.
1535 */
1536
d3c2ae1c
GW
1537 return (shared);
1538}
ca0bf58d 1539
a7004725
DK
1540/*
1541 * Free the checksum associated with this header. If there is no checksum, this
1542 * is a no-op.
1543 */
d3c2ae1c
GW
1544static inline void
1545arc_cksum_free(arc_buf_hdr_t *hdr)
1546{
1547 ASSERT(HDR_HAS_L1HDR(hdr));
b5256303 1548
d3c2ae1c
GW
1549 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1550 if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1551 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1552 hdr->b_l1hdr.b_freeze_cksum = NULL;
b9541d6b 1553 }
d3c2ae1c 1554 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
b9541d6b
CW
1555}
1556
a7004725
DK
1557/*
1558 * Return true iff at least one of the bufs on hdr is not compressed.
b5256303 1559 * Encrypted buffers count as compressed.
a7004725
DK
1560 */
1561static boolean_t
1562arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1563{
149ce888
TC
1564 ASSERT(hdr->b_l1hdr.b_state == arc_anon ||
1565 MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
1566
a7004725
DK
1567 for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1568 if (!ARC_BUF_COMPRESSED(b)) {
1569 return (B_TRUE);
1570 }
1571 }
1572 return (B_FALSE);
1573}
1574
1575
524b4217
DK
1576/*
1577 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1578 * matches the checksum that is stored in the hdr. If there is no checksum,
1579 * or if the buf is compressed, this is a no-op.
1580 */
34dc7c2f
BB
1581static void
1582arc_cksum_verify(arc_buf_t *buf)
1583{
d3c2ae1c 1584 arc_buf_hdr_t *hdr = buf->b_hdr;
34dc7c2f
BB
1585 zio_cksum_t zc;
1586
1587 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1588 return;
1589
149ce888 1590 if (ARC_BUF_COMPRESSED(buf))
524b4217 1591 return;
524b4217 1592
d3c2ae1c
GW
1593 ASSERT(HDR_HAS_L1HDR(hdr));
1594
1595 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
149ce888 1596
d3c2ae1c
GW
1597 if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1598 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1599 return;
1600 }
2aa34383 1601
3c67d83a 1602 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
d3c2ae1c 1603 if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
34dc7c2f 1604 panic("buffer modified while frozen!");
d3c2ae1c 1605 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1606}
1607
b5256303
TC
1608/*
1609 * This function makes the assumption that data stored in the L2ARC
1610 * will be transformed exactly as it is in the main pool. Because of
1611 * this we can verify the checksum against the reading process's bp.
1612 */
d3c2ae1c
GW
1613static boolean_t
1614arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
34dc7c2f 1615{
d3c2ae1c
GW
1616 ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1617 VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
34dc7c2f 1618
d3c2ae1c
GW
1619 /*
1620 * Block pointers always store the checksum for the logical data.
1621 * If the block pointer has the gang bit set, then the checksum
1622 * it represents is for the reconstituted data and not for an
1623 * individual gang member. The zio pipeline, however, must be able to
1624 * determine the checksum of each of the gang constituents so it
1625 * treats the checksum comparison differently than what we need
1626 * for l2arc blocks. This prevents us from using the
1627 * zio_checksum_error() interface directly. Instead we must call the
1628 * zio_checksum_error_impl() so that we can ensure the checksum is
1629 * generated using the correct checksum algorithm and accounts for the
1630 * logical I/O size and not just a gang fragment.
1631 */
b5256303 1632 return (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
a6255b7f 1633 BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
d3c2ae1c 1634 zio->io_offset, NULL) == 0);
34dc7c2f
BB
1635}
1636
524b4217
DK
1637/*
1638 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
1639 * checksum and attaches it to the buf's hdr so that we can ensure that the buf
1640 * isn't modified later on. If buf is compressed or there is already a checksum
1641 * on the hdr, this is a no-op (we only checksum uncompressed bufs).
1642 */
34dc7c2f 1643static void
d3c2ae1c 1644arc_cksum_compute(arc_buf_t *buf)
34dc7c2f 1645{
d3c2ae1c
GW
1646 arc_buf_hdr_t *hdr = buf->b_hdr;
1647
1648 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
34dc7c2f
BB
1649 return;
1650
d3c2ae1c 1651 ASSERT(HDR_HAS_L1HDR(hdr));
2aa34383 1652
b9541d6b 1653 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
149ce888 1654 if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
d3c2ae1c 1655 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
34dc7c2f
BB
1656 return;
1657 }
2aa34383 1658
b5256303 1659 ASSERT(!ARC_BUF_ENCRYPTED(buf));
2aa34383 1660 ASSERT(!ARC_BUF_COMPRESSED(buf));
d3c2ae1c
GW
1661 hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1662 KM_SLEEP);
3c67d83a 1663 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
d3c2ae1c
GW
1664 hdr->b_l1hdr.b_freeze_cksum);
1665 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
498877ba
MA
1666 arc_buf_watch(buf);
1667}
1668
1669#ifndef _KERNEL
1670void
1671arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
1672{
02730c33 1673 panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
498877ba
MA
1674}
1675#endif
1676
1677/* ARGSUSED */
1678static void
1679arc_buf_unwatch(arc_buf_t *buf)
1680{
1681#ifndef _KERNEL
1682 if (arc_watch) {
a7004725 1683 ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
498877ba
MA
1684 PROT_READ | PROT_WRITE));
1685 }
1686#endif
1687}
1688
1689/* ARGSUSED */
1690static void
1691arc_buf_watch(arc_buf_t *buf)
1692{
1693#ifndef _KERNEL
1694 if (arc_watch)
2aa34383 1695 ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
d3c2ae1c 1696 PROT_READ));
498877ba 1697#endif
34dc7c2f
BB
1698}
1699
b9541d6b
CW
1700static arc_buf_contents_t
1701arc_buf_type(arc_buf_hdr_t *hdr)
1702{
d3c2ae1c 1703 arc_buf_contents_t type;
b9541d6b 1704 if (HDR_ISTYPE_METADATA(hdr)) {
d3c2ae1c 1705 type = ARC_BUFC_METADATA;
b9541d6b 1706 } else {
d3c2ae1c 1707 type = ARC_BUFC_DATA;
b9541d6b 1708 }
d3c2ae1c
GW
1709 VERIFY3U(hdr->b_type, ==, type);
1710 return (type);
b9541d6b
CW
1711}
1712
2aa34383
DK
1713boolean_t
1714arc_is_metadata(arc_buf_t *buf)
1715{
1716 return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
1717}
1718
b9541d6b
CW
1719static uint32_t
1720arc_bufc_to_flags(arc_buf_contents_t type)
1721{
1722 switch (type) {
1723 case ARC_BUFC_DATA:
1724 /* metadata field is 0 if buffer contains normal data */
1725 return (0);
1726 case ARC_BUFC_METADATA:
1727 return (ARC_FLAG_BUFC_METADATA);
1728 default:
1729 break;
1730 }
1731 panic("undefined ARC buffer type!");
1732 return ((uint32_t)-1);
1733}
1734
34dc7c2f
BB
1735void
1736arc_buf_thaw(arc_buf_t *buf)
1737{
d3c2ae1c
GW
1738 arc_buf_hdr_t *hdr = buf->b_hdr;
1739
2aa34383
DK
1740 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
1741 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1742
524b4217 1743 arc_cksum_verify(buf);
34dc7c2f 1744
2aa34383 1745 /*
149ce888 1746 * Compressed buffers do not manipulate the b_freeze_cksum.
2aa34383 1747 */
149ce888 1748 if (ARC_BUF_COMPRESSED(buf))
2aa34383 1749 return;
2aa34383 1750
d3c2ae1c
GW
1751 ASSERT(HDR_HAS_L1HDR(hdr));
1752 arc_cksum_free(hdr);
498877ba 1753 arc_buf_unwatch(buf);
34dc7c2f
BB
1754}
1755
1756void
1757arc_buf_freeze(arc_buf_t *buf)
1758{
1759 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1760 return;
1761
149ce888 1762 if (ARC_BUF_COMPRESSED(buf))
2aa34383 1763 return;
428870ff 1764
149ce888 1765 ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
d3c2ae1c 1766 arc_cksum_compute(buf);
34dc7c2f
BB
1767}
1768
d3c2ae1c
GW
1769/*
1770 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1771 * the following functions should be used to ensure that the flags are
1772 * updated in a thread-safe way. When manipulating the flags either
1773 * the hash_lock must be held or the hdr must be undiscoverable. This
1774 * ensures that we're not racing with any other threads when updating
1775 * the flags.
1776 */
1777static inline void
1778arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1779{
1780 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
1781 hdr->b_flags |= flags;
1782}
1783
1784static inline void
1785arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1786{
1787 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
1788 hdr->b_flags &= ~flags;
1789}
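/*
 * Illustrative sketch (not part of the original file): how a caller is
 * expected to update b_flags under the hash lock, per the comment above.
 * "example_mark_prefetch" is a hypothetical helper, not an existing ARC
 * interface; headers that are undiscoverable (HDR_EMPTY) may skip the
 * lock.
 */
static void
example_mark_prefetch(arc_buf_hdr_t *hdr)
{
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	mutex_enter(hash_lock);
	arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
	mutex_exit(hash_lock);
}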
1790
1791/*
1792 * Setting the compression bits in the arc_buf_hdr_t's b_flags is
1793 * done in a special way since we have to clear and set bits
1794 * at the same time. Consumers that wish to set the compression bits
1795 * must use this function to ensure that the flags are updated in a
1796 * thread-safe manner.
1797 */
1798static void
1799arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
1800{
1801 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
1802
1803 /*
1804 * Holes and embedded blocks will always have a psize = 0 so
1805 * we ignore the compression of the blkptr and mark them
d3c2ae1c
GW
1806 * as uncompressed.
1807 */
1808 if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
1809 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
d3c2ae1c 1810 ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
d3c2ae1c
GW
1811 } else {
1812 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
d3c2ae1c
GW
1813 ASSERT(HDR_COMPRESSION_ENABLED(hdr));
1814 }
b5256303
TC
1815
1816 HDR_SET_COMPRESS(hdr, cmp);
1817 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
d3c2ae1c
GW
1818}
1819
524b4217
DK
1820/*
1821 * Looks for another buf on the same hdr which has the data decompressed, copies
1822 * from it, and returns true. If no such buf exists, returns false.
1823 */
1824static boolean_t
1825arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
1826{
1827 arc_buf_hdr_t *hdr = buf->b_hdr;
524b4217
DK
1828 boolean_t copied = B_FALSE;
1829
1830 ASSERT(HDR_HAS_L1HDR(hdr));
1831 ASSERT3P(buf->b_data, !=, NULL);
1832 ASSERT(!ARC_BUF_COMPRESSED(buf));
1833
a7004725 1834 for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
524b4217
DK
1835 from = from->b_next) {
1836 /* can't use our own data buffer */
1837 if (from == buf) {
1838 continue;
1839 }
1840
1841 if (!ARC_BUF_COMPRESSED(from)) {
1842 bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
1843 copied = B_TRUE;
1844 break;
1845 }
1846 }
1847
1848 /*
1849 * There were no decompressed bufs, so there should not be a
1850 * checksum on the hdr either.
1851 */
1852 EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
1853
1854 return (copied);
1855}
1856
b5256303
TC
1857/*
1858 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
1859 */
1860static uint64_t
1861arc_hdr_size(arc_buf_hdr_t *hdr)
1862{
1863 uint64_t size;
1864
1865 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
1866 HDR_GET_PSIZE(hdr) > 0) {
1867 size = HDR_GET_PSIZE(hdr);
1868 } else {
1869 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
1870 size = HDR_GET_LSIZE(hdr);
1871 }
1872 return (size);
1873}
1874
1875static int
1876arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
1877{
1878 int ret;
1879 uint64_t csize;
1880 uint64_t lsize = HDR_GET_LSIZE(hdr);
1881 uint64_t psize = HDR_GET_PSIZE(hdr);
1882 void *tmpbuf = NULL;
1883 abd_t *abd = hdr->b_l1hdr.b_pabd;
1884
149ce888 1885 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
b5256303
TC
1886 ASSERT(HDR_AUTHENTICATED(hdr));
1887 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1888
1889 /*
1890 * The MAC is calculated on the compressed data that is stored on disk.
1891 * However, if compressed arc is disabled we will only have the
1892 * decompressed data available to us now. Compress it into a temporary
1893 * abd so we can verify the MAC. The performance overhead of this will
1894 * be relatively low, since most objects in an encrypted objset will
1895 * be encrypted (instead of authenticated) anyway.
1896 */
1897 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1898 !HDR_COMPRESSION_ENABLED(hdr)) {
1899 tmpbuf = zio_buf_alloc(lsize);
1900 abd = abd_get_from_buf(tmpbuf, lsize);
1901 abd_take_ownership_of_buf(abd, B_TRUE);
1902
1903 csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
1904 hdr->b_l1hdr.b_pabd, tmpbuf, lsize);
1905 ASSERT3U(csize, <=, psize);
1906 abd_zero_off(abd, csize, psize - csize);
1907 }
1908
1909 /*
1910 * Authentication is best effort. We authenticate whenever the key is
1911 * available. If we succeed we clear ARC_FLAG_NOAUTH.
1912 */
1913 if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
1914 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1915 ASSERT3U(lsize, ==, psize);
1916 ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
1917 psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1918 } else {
1919 ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
1920 hdr->b_crypt_hdr.b_mac);
1921 }
1922
1923 if (ret == 0)
1924 arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
1925 else if (ret != ENOENT)
1926 goto error;
1927
1928 if (tmpbuf != NULL)
1929 abd_free(abd);
1930
1931 return (0);
1932
1933error:
1934 if (tmpbuf != NULL)
1935 abd_free(abd);
1936
1937 return (ret);
1938}
1939
1940/*
1941 * This function will take a header that only has raw encrypted data in
1942 * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
1943 * b_l1hdr.b_pabd. If designated in the header flags, this function will
1944 * also decompress the data.
1945 */
1946static int
be9a5c35 1947arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
b5256303
TC
1948{
1949 int ret;
b5256303
TC
1950 abd_t *cabd = NULL;
1951 void *tmp = NULL;
1952 boolean_t no_crypt = B_FALSE;
1953 boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1954
149ce888 1955 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
b5256303
TC
1956 ASSERT(HDR_ENCRYPTED(hdr));
1957
1958 arc_hdr_alloc_abd(hdr, B_FALSE);
1959
be9a5c35
TC
1960 ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
1961 B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
1962 hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
b5256303
TC
1963 hdr->b_crypt_hdr.b_rabd, &no_crypt);
1964 if (ret != 0)
1965 goto error;
1966
1967 if (no_crypt) {
1968 abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
1969 HDR_GET_PSIZE(hdr));
1970 }
1971
1972 /*
1973 * If this header has disabled arc compression but the b_pabd is
1974 * compressed after decrypting it, we need to decompress the newly
1975 * decrypted data.
1976 */
1977 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1978 !HDR_COMPRESSION_ENABLED(hdr)) {
1979 /*
1980 * We want to make sure that we are correctly honoring the
1981 * zfs_abd_scatter_enabled setting, so we allocate an abd here
1982 * and then loan a buffer from it, rather than allocating a
1983 * linear buffer and wrapping it in an abd later.
1984 */
1985 cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
1986 tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
1987
1988 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
1989 hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
1990 HDR_GET_LSIZE(hdr));
1991 if (ret != 0) {
1992 abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
1993 goto error;
1994 }
1995
1996 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
1997 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
1998 arc_hdr_size(hdr), hdr);
1999 hdr->b_l1hdr.b_pabd = cabd;
2000 }
2001
b5256303
TC
2002 return (0);
2003
2004error:
2005 arc_hdr_free_abd(hdr, B_FALSE);
b5256303
TC
2006 if (cabd != NULL)
2007 arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
2008
2009 return (ret);
2010}
2011
2012/*
2013 * This function is called during arc_buf_fill() to prepare the header's
2014 * abd plaintext pointer for use. This involves authenticated protected
2015 * data and decrypting encrypted data into the plaintext abd.
2016 */
2017static int
2018arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
be9a5c35 2019 const zbookmark_phys_t *zb, boolean_t noauth)
b5256303
TC
2020{
2021 int ret;
2022
2023 ASSERT(HDR_PROTECTED(hdr));
2024
2025 if (hash_lock != NULL)
2026 mutex_enter(hash_lock);
2027
2028 if (HDR_NOAUTH(hdr) && !noauth) {
2029 /*
2030 * The caller requested authenticated data but our data has
2031 * not been authenticated yet. Verify the MAC now if we can.
2032 */
be9a5c35 2033 ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
b5256303
TC
2034 if (ret != 0)
2035 goto error;
2036 } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
2037 /*
2038 * If we only have the encrypted version of the data, but the
2039 * unencrypted version was requested we take this opportunity
2040 * to store the decrypted version in the header for future use.
2041 */
be9a5c35 2042 ret = arc_hdr_decrypt(hdr, spa, zb);
b5256303
TC
2043 if (ret != 0)
2044 goto error;
2045 }
2046
2047 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2048
2049 if (hash_lock != NULL)
2050 mutex_exit(hash_lock);
2051
2052 return (0);
2053
2054error:
2055 if (hash_lock != NULL)
2056 mutex_exit(hash_lock);
2057
2058 return (ret);
2059}
2060
2061/*
2062 * This function is used by the dbuf code to decrypt bonus buffers in place.
2063 * The dbuf code itself doesn't have any locking for decrypting a shared dnode
2064 * block, so we use the hash lock here to protect against concurrent calls to
2065 * arc_buf_fill().
2066 */
2067static void
2068arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
2069{
2070 arc_buf_hdr_t *hdr = buf->b_hdr;
2071
2072 ASSERT(HDR_ENCRYPTED(hdr));
2073 ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
149ce888 2074 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
b5256303
TC
2075 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2076
2077 zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
2078 arc_buf_size(buf));
2079 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
2080 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2081 hdr->b_crypt_hdr.b_ebufcnt -= 1;
2082}
2083
524b4217
DK
2084/*
2085 * Given a buf that has a data buffer attached to it, this function will
2086 * efficiently fill the buf with data of the specified compression setting from
2087 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
2088 * are already sharing a data buf, no copy is performed.
2089 *
2090 * If the buf is marked as compressed but uncompressed data was requested, this
2091 * will allocate a new data buffer for the buf, remove that flag, and fill the
2092 * buf with uncompressed data. You can't request a compressed buf on a hdr with
2093 * uncompressed data, and (since we haven't added support for it yet) if you
2094 * want compressed data your buf must already be marked as compressed and have
2095 * the correct-sized data buffer.
2096 */
2097static int
be9a5c35
TC
2098arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2099 arc_fill_flags_t flags)
d3c2ae1c 2100{
b5256303 2101 int error = 0;
d3c2ae1c 2102 arc_buf_hdr_t *hdr = buf->b_hdr;
b5256303
TC
2103 boolean_t hdr_compressed =
2104 (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
2105 boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
2106 boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
d3c2ae1c 2107 dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
b5256303 2108 kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
d3c2ae1c 2109
524b4217 2110 ASSERT3P(buf->b_data, !=, NULL);
b5256303 2111 IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
524b4217 2112 IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
b5256303
TC
2113 IMPLY(encrypted, HDR_ENCRYPTED(hdr));
2114 IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
2115 IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
2116 IMPLY(encrypted, !ARC_BUF_SHARED(buf));
2117
2118 /*
2119 * If the caller wanted encrypted data we just need to copy it from
2120 * b_rabd and potentially byteswap it. We won't be able to do any
2121 * further transforms on it.
2122 */
2123 if (encrypted) {
2124 ASSERT(HDR_HAS_RABD(hdr));
2125 abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
2126 HDR_GET_PSIZE(hdr));
2127 goto byteswap;
2128 }
2129
2130 /*
69830602
TC
2131 * Adjust encrypted and authenticated headers to accommodate
2132 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
2133 * allowed to fail decryption due to keys not being loaded
2134 * without being marked as an IO error.
b5256303
TC
2135 */
2136 if (HDR_PROTECTED(hdr)) {
2137 error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
be9a5c35 2138 zb, !!(flags & ARC_FILL_NOAUTH));
69830602
TC
2139 if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
2140 return (error);
2141 } else if (error != 0) {
e7504d7a
TC
2142 if (hash_lock != NULL)
2143 mutex_enter(hash_lock);
2c24b5b1 2144 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
e7504d7a
TC
2145 if (hash_lock != NULL)
2146 mutex_exit(hash_lock);
b5256303 2147 return (error);
2c24b5b1 2148 }
b5256303
TC
2149 }
2150
2151 /*
2152 * There is a special case here for dnode blocks which are
2153 * decrypting their bonus buffers. These blocks may request to
2154 * be decrypted in-place. This is necessary because there may
2155 * be many dnodes pointing into this buffer and there is
2156 * currently no method to synchronize replacing the backing
2157 * b_data buffer and updating all of the pointers. Here we use
2158 * the hash lock to ensure there are no races. If the need
2159 * arises for other types to be decrypted in-place, they must
2160 * add handling here as well.
2161 */
2162 if ((flags & ARC_FILL_IN_PLACE) != 0) {
2163 ASSERT(!hdr_compressed);
2164 ASSERT(!compressed);
2165 ASSERT(!encrypted);
2166
2167 if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
2168 ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
2169
2170 if (hash_lock != NULL)
2171 mutex_enter(hash_lock);
2172 arc_buf_untransform_in_place(buf, hash_lock);
2173 if (hash_lock != NULL)
2174 mutex_exit(hash_lock);
2175
2176 /* Compute the hdr's checksum if necessary */
2177 arc_cksum_compute(buf);
2178 }
2179
2180 return (0);
2181 }
524b4217
DK
2182
2183 if (hdr_compressed == compressed) {
2aa34383 2184 if (!arc_buf_is_shared(buf)) {
a6255b7f 2185 abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
524b4217 2186 arc_buf_size(buf));
2aa34383 2187 }
d3c2ae1c 2188 } else {
524b4217
DK
2189 ASSERT(hdr_compressed);
2190 ASSERT(!compressed);
d3c2ae1c 2191 ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
2aa34383
DK
2192
2193 /*
524b4217
DK
2194 * If the buf is sharing its data with the hdr, unlink it and
2195 * allocate a new data buffer for the buf.
2aa34383 2196 */
524b4217
DK
2197 if (arc_buf_is_shared(buf)) {
2198 ASSERT(ARC_BUF_COMPRESSED(buf));
2199
2200 /* We need to give the buf its own b_data */
2201 buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2aa34383
DK
2202 buf->b_data =
2203 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2204 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2205
524b4217 2206 /* Previously overhead was 0; just add new overhead */
2aa34383 2207 ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
524b4217
DK
2208 } else if (ARC_BUF_COMPRESSED(buf)) {
2209 /* We need to reallocate the buf's b_data */
2210 arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
2211 buf);
2212 buf->b_data =
2213 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2214
2215 /* We increased the size of b_data; update overhead */
2216 ARCSTAT_INCR(arcstat_overhead_size,
2217 HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
2aa34383
DK
2218 }
2219
524b4217
DK
2220 /*
2221 * Regardless of the buf's previous compression settings, it
2222 * should not be compressed at the end of this function.
2223 */
2224 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2225
2226 /*
2227 * Try copying the data from another buf which already has a
2228 * decompressed version. If that's not possible, it's time to
2229 * bite the bullet and decompress the data from the hdr.
2230 */
2231 if (arc_buf_try_copy_decompressed_data(buf)) {
2232 /* Skip byteswapping and checksumming (already done) */
2233 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
2234 return (0);
2235 } else {
b5256303 2236 error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
a6255b7f 2237 hdr->b_l1hdr.b_pabd, buf->b_data,
524b4217
DK
2238 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2239
2240 /*
2241 * Absent hardware errors or software bugs, this should
2242 * be impossible, but log it anyway so we can debug it.
2243 */
2244 if (error != 0) {
2245 zfs_dbgmsg(
2246 "hdr %p, compress %d, psize %d, lsize %d",
b5256303 2247 hdr, arc_hdr_get_compress(hdr),
524b4217 2248 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
e7504d7a
TC
2249 if (hash_lock != NULL)
2250 mutex_enter(hash_lock);
2c24b5b1 2251 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
e7504d7a
TC
2252 if (hash_lock != NULL)
2253 mutex_exit(hash_lock);
524b4217
DK
2254 return (SET_ERROR(EIO));
2255 }
d3c2ae1c
GW
2256 }
2257 }
524b4217 2258
b5256303 2259byteswap:
524b4217 2260 /* Byteswap the buf's data if necessary */
d3c2ae1c
GW
2261 if (bswap != DMU_BSWAP_NUMFUNCS) {
2262 ASSERT(!HDR_SHARED_DATA(hdr));
2263 ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
2264 dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
2265 }
524b4217
DK
2266
2267 /* Compute the hdr's checksum if necessary */
d3c2ae1c 2268 arc_cksum_compute(buf);
524b4217 2269
d3c2ae1c
GW
2270 return (0);
2271}
2272
2273/*
b5256303
TC
2274 * If this function is being called to decrypt an encrypted buffer or verify an
2275 * authenticated one, the key must be loaded and a mapping must be made
2276 * available in the keystore via spa_keystore_create_mapping() or one of its
2277 * callers.
d3c2ae1c 2278 */
b5256303 2279int
a2c2ed1b
TC
2280arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2281 boolean_t in_place)
d3c2ae1c 2282{
a2c2ed1b 2283 int ret;
b5256303 2284 arc_fill_flags_t flags = 0;
d3c2ae1c 2285
b5256303
TC
2286 if (in_place)
2287 flags |= ARC_FILL_IN_PLACE;
2288
be9a5c35 2289 ret = arc_buf_fill(buf, spa, zb, flags);
a2c2ed1b
TC
2290 if (ret == ECKSUM) {
2291 /*
2292 * Convert authentication and decryption errors to EIO
2293 * (and generate an ereport) before leaving the ARC.
2294 */
2295 ret = SET_ERROR(EIO);
be9a5c35 2296 spa_log_error(spa, zb);
a2c2ed1b
TC
2297 zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
2298 spa, NULL, zb, NULL, 0, 0);
2299 }
2300
2301 return (ret);
d3c2ae1c
GW
2302}
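/*
 * Illustrative sketch (not part of the original file): the calling
 * convention described in the comment above.  The caller is assumed to
 * have already loaded the key and established a keystore mapping (e.g.
 * via spa_keystore_create_mapping()); error handling is elided.
 */
static int
example_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb)
{
	/*
	 * B_FALSE: fill the buf's own b_data rather than decrypting in
	 * place; in-place decryption is reserved for dnode bonus buffers.
	 */
	return (arc_untransform(buf, spa, zb, B_FALSE));
}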
2303
2304/*
2305 * Increment the amount of evictable space in the arc_state_t's refcount.
2306 * We account for the space used by the hdr and the arc buf individually
2307 * so that we can add and remove them from the refcount individually.
2308 */
34dc7c2f 2309static void
d3c2ae1c
GW
2310arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2311{
2312 arc_buf_contents_t type = arc_buf_type(hdr);
d3c2ae1c
GW
2313
2314 ASSERT(HDR_HAS_L1HDR(hdr));
2315
2316 if (GHOST_STATE(state)) {
2317 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2318 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
a6255b7f 2319 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
b5256303 2320 ASSERT(!HDR_HAS_RABD(hdr));
2aa34383
DK
2321 (void) refcount_add_many(&state->arcs_esize[type],
2322 HDR_GET_LSIZE(hdr), hdr);
d3c2ae1c
GW
2323 return;
2324 }
2325
2326 ASSERT(!GHOST_STATE(state));
a6255b7f 2327 if (hdr->b_l1hdr.b_pabd != NULL) {
d3c2ae1c
GW
2328 (void) refcount_add_many(&state->arcs_esize[type],
2329 arc_hdr_size(hdr), hdr);
2330 }
b5256303
TC
2331 if (HDR_HAS_RABD(hdr)) {
2332 (void) refcount_add_many(&state->arcs_esize[type],
2333 HDR_GET_PSIZE(hdr), hdr);
2334 }
2335
1c27024e
DB
2336 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2337 buf = buf->b_next) {
2aa34383 2338 if (arc_buf_is_shared(buf))
d3c2ae1c 2339 continue;
2aa34383
DK
2340 (void) refcount_add_many(&state->arcs_esize[type],
2341 arc_buf_size(buf), buf);
d3c2ae1c
GW
2342 }
2343}
2344
2345/*
2346 * Decrement the amount of evictable space in the arc_state_t's refcount.
2347 * We account for the space used by the hdr and the arc buf individually
2348 * so that we can add and remove them from the refcount individually.
2349 */
2350static void
2aa34383 2351arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
d3c2ae1c
GW
2352{
2353 arc_buf_contents_t type = arc_buf_type(hdr);
d3c2ae1c
GW
2354
2355 ASSERT(HDR_HAS_L1HDR(hdr));
2356
2357 if (GHOST_STATE(state)) {
2358 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2359 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
a6255b7f 2360 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
b5256303 2361 ASSERT(!HDR_HAS_RABD(hdr));
d3c2ae1c 2362 (void) refcount_remove_many(&state->arcs_esize[type],
2aa34383 2363 HDR_GET_LSIZE(hdr), hdr);
d3c2ae1c
GW
2364 return;
2365 }
2366
2367 ASSERT(!GHOST_STATE(state));
a6255b7f 2368 if (hdr->b_l1hdr.b_pabd != NULL) {
d3c2ae1c
GW
2369 (void) refcount_remove_many(&state->arcs_esize[type],
2370 arc_hdr_size(hdr), hdr);
2371 }
b5256303
TC
2372 if (HDR_HAS_RABD(hdr)) {
2373 (void) refcount_remove_many(&state->arcs_esize[type],
2374 HDR_GET_PSIZE(hdr), hdr);
2375 }
2376
1c27024e
DB
2377 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2378 buf = buf->b_next) {
2aa34383 2379 if (arc_buf_is_shared(buf))
d3c2ae1c 2380 continue;
d3c2ae1c 2381 (void) refcount_remove_many(&state->arcs_esize[type],
2aa34383 2382 arc_buf_size(buf), buf);
d3c2ae1c
GW
2383 }
2384}
2385
2386/*
2387 * Add a reference to this hdr indicating that someone is actively
2388 * referencing that memory. When the refcount transitions from 0 to 1,
2389 * we remove it from the respective arc_state_t list to indicate that
2390 * it is not evictable.
2391 */
2392static void
2393add_reference(arc_buf_hdr_t *hdr, void *tag)
34dc7c2f 2394{
b9541d6b
CW
2395 arc_state_t *state;
2396
2397 ASSERT(HDR_HAS_L1HDR(hdr));
d3c2ae1c
GW
2398 if (!MUTEX_HELD(HDR_LOCK(hdr))) {
2399 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2400 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2401 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2402 }
34dc7c2f 2403
b9541d6b
CW
2404 state = hdr->b_l1hdr.b_state;
2405
2406 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2407 (state != arc_anon)) {
2408 /* We don't use the L2-only state list. */
2409 if (state != arc_l2c_only) {
64fc7762 2410 multilist_remove(state->arcs_list[arc_buf_type(hdr)],
d3c2ae1c 2411 hdr);
2aa34383 2412 arc_evictable_space_decrement(hdr, state);
34dc7c2f 2413 }
b128c09f 2414 /* remove the prefetch flag if we get a reference */
d3c2ae1c 2415 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
34dc7c2f
BB
2416 }
2417}
2418
d3c2ae1c
GW
2419/*
2420 * Remove a reference from this hdr. When the reference transitions from
2421 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2422 * list making it eligible for eviction.
2423 */
34dc7c2f 2424static int
2a432414 2425remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
34dc7c2f
BB
2426{
2427 int cnt;
b9541d6b 2428 arc_state_t *state = hdr->b_l1hdr.b_state;
34dc7c2f 2429
b9541d6b 2430 ASSERT(HDR_HAS_L1HDR(hdr));
34dc7c2f
BB
2431 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
2432 ASSERT(!GHOST_STATE(state));
2433
b9541d6b
CW
2434 /*
2435 * arc_l2c_only counts as a ghost state so we don't need to explicitly
2436 * check to prevent usage of the arc_l2c_only list.
2437 */
2438 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
34dc7c2f 2439 (state != arc_anon)) {
64fc7762 2440 multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
d3c2ae1c
GW
2441 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
2442 arc_evictable_space_increment(hdr, state);
34dc7c2f
BB
2443 }
2444 return (cnt);
2445}
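/*
 * Illustrative sketch (not part of the original file): how holds gate
 * evictability.  Taking a hold with add_reference() removes the hdr from
 * its state's list (making it unevictable); dropping the last hold with
 * remove_reference() reinserts it, making it evictable again.  "tag" is a
 * hypothetical holder and hash_lock is assumed to be HDR_LOCK(hdr).
 */
static void
example_hold_and_release(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
	mutex_enter(hash_lock);
	add_reference(hdr, tag);
	mutex_exit(hash_lock);

	/* ... safely use the header's data while holding the reference ... */

	mutex_enter(hash_lock);
	(void) remove_reference(hdr, hash_lock, tag);
	mutex_exit(hash_lock);
}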
2446
e0b0ca98
BB
2447/*
2448 * Returns detailed information about a specific arc buffer. When the
2449 * state_index argument is set the function will calculate the arc header
2450 * list position for its arc state. Since this requires a linear traversal
2451 * callers are strongly encouraged not to do this. However, it can be helpful
2452 * for targeted analysis so the functionality is provided.
2453 */
2454void
2455arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
2456{
2457 arc_buf_hdr_t *hdr = ab->b_hdr;
b9541d6b
CW
2458 l1arc_buf_hdr_t *l1hdr = NULL;
2459 l2arc_buf_hdr_t *l2hdr = NULL;
2460 arc_state_t *state = NULL;
2461
8887c7d7
TC
2462 memset(abi, 0, sizeof (arc_buf_info_t));
2463
2464 if (hdr == NULL)
2465 return;
2466
2467 abi->abi_flags = hdr->b_flags;
2468
b9541d6b
CW
2469 if (HDR_HAS_L1HDR(hdr)) {
2470 l1hdr = &hdr->b_l1hdr;
2471 state = l1hdr->b_state;
2472 }
2473 if (HDR_HAS_L2HDR(hdr))
2474 l2hdr = &hdr->b_l2hdr;
e0b0ca98 2475
b9541d6b 2476 if (l1hdr) {
d3c2ae1c 2477 abi->abi_bufcnt = l1hdr->b_bufcnt;
b9541d6b
CW
2478 abi->abi_access = l1hdr->b_arc_access;
2479 abi->abi_mru_hits = l1hdr->b_mru_hits;
2480 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
2481 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
2482 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
2483 abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
2484 }
2485
2486 if (l2hdr) {
2487 abi->abi_l2arc_dattr = l2hdr->b_daddr;
b9541d6b
CW
2488 abi->abi_l2arc_hits = l2hdr->b_hits;
2489 }
2490
e0b0ca98 2491 abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
b9541d6b 2492 abi->abi_state_contents = arc_buf_type(hdr);
d3c2ae1c 2493 abi->abi_size = arc_hdr_size(hdr);
e0b0ca98
BB
2494}
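/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller of arc_buf_info().  Per the comment above, requesting the state
 * list position is discouraged; the state_index argument is passed as 0
 * purely for illustration.
 */
static void
example_inspect_buf(arc_buf_t *ab)
{
	arc_buf_info_t abi;

	arc_buf_info(ab, &abi, 0);

	dprintf("flags 0x%llx, holds %llu, size %llu\n",
	    (u_longlong_t)abi.abi_flags, (u_longlong_t)abi.abi_holds,
	    (u_longlong_t)abi.abi_size);
}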
2495
34dc7c2f 2496/*
ca0bf58d 2497 * Move the supplied buffer to the indicated state. The hash lock
34dc7c2f
BB
2498 * for the buffer must be held by the caller.
2499 */
2500static void
2a432414
GW
2501arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
2502 kmutex_t *hash_lock)
34dc7c2f 2503{
b9541d6b
CW
2504 arc_state_t *old_state;
2505 int64_t refcnt;
d3c2ae1c
GW
2506 uint32_t bufcnt;
2507 boolean_t update_old, update_new;
b9541d6b
CW
2508 arc_buf_contents_t buftype = arc_buf_type(hdr);
2509
2510 /*
2511 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
2512 * in arc_read() when bringing a buffer out of the L2ARC. However, the
2513 * L1 hdr doesn't always exist when we change state to arc_anon before
2514 * destroying a header, in which case reallocating to add the L1 hdr is
2515 * pointless.
2516 */
2517 if (HDR_HAS_L1HDR(hdr)) {
2518 old_state = hdr->b_l1hdr.b_state;
2519 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
d3c2ae1c 2520 bufcnt = hdr->b_l1hdr.b_bufcnt;
b5256303
TC
2521 update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
2522 HDR_HAS_RABD(hdr));
b9541d6b
CW
2523 } else {
2524 old_state = arc_l2c_only;
2525 refcnt = 0;
d3c2ae1c
GW
2526 bufcnt = 0;
2527 update_old = B_FALSE;
b9541d6b 2528 }
d3c2ae1c 2529 update_new = update_old;
34dc7c2f
BB
2530
2531 ASSERT(MUTEX_HELD(hash_lock));
e8b96c60 2532 ASSERT3P(new_state, !=, old_state);
d3c2ae1c
GW
2533 ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
2534 ASSERT(old_state != arc_anon || bufcnt <= 1);
34dc7c2f
BB
2535
2536 /*
2537 * If this buffer is evictable, transfer it from the
2538 * old state list to the new state list.
2539 */
2540 if (refcnt == 0) {
b9541d6b 2541 if (old_state != arc_anon && old_state != arc_l2c_only) {
b9541d6b 2542 ASSERT(HDR_HAS_L1HDR(hdr));
64fc7762 2543 multilist_remove(old_state->arcs_list[buftype], hdr);
34dc7c2f 2544
d3c2ae1c
GW
2545 if (GHOST_STATE(old_state)) {
2546 ASSERT0(bufcnt);
2547 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2548 update_old = B_TRUE;
34dc7c2f 2549 }
2aa34383 2550 arc_evictable_space_decrement(hdr, old_state);
34dc7c2f 2551 }
b9541d6b 2552 if (new_state != arc_anon && new_state != arc_l2c_only) {
b9541d6b
CW
2553 /*
2554 * An L1 header always exists here, since if we're
2555 * moving to some L1-cached state (i.e. not l2c_only or
2556 * anonymous), we realloc the header to add an L1hdr
2557 * beforehand.
2558 */
2559 ASSERT(HDR_HAS_L1HDR(hdr));
64fc7762 2560 multilist_insert(new_state->arcs_list[buftype], hdr);
34dc7c2f 2561
34dc7c2f 2562 if (GHOST_STATE(new_state)) {
d3c2ae1c
GW
2563 ASSERT0(bufcnt);
2564 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2565 update_new = B_TRUE;
34dc7c2f 2566 }
d3c2ae1c 2567 arc_evictable_space_increment(hdr, new_state);
34dc7c2f
BB
2568 }
2569 }
2570
d3c2ae1c 2571 ASSERT(!HDR_EMPTY(hdr));
2a432414
GW
2572 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
2573 buf_hash_remove(hdr);
34dc7c2f 2574
b9541d6b 2575 /* adjust state sizes (ignore arc_l2c_only) */
36da08ef 2576
d3c2ae1c 2577 if (update_new && new_state != arc_l2c_only) {
36da08ef
PS
2578 ASSERT(HDR_HAS_L1HDR(hdr));
2579 if (GHOST_STATE(new_state)) {
d3c2ae1c 2580 ASSERT0(bufcnt);
36da08ef
PS
2581
2582 /*
d3c2ae1c 2583 * When moving a header to a ghost state, we first
36da08ef 2584 * remove all arc buffers. Thus, we'll have a
d3c2ae1c 2585 * bufcnt of zero, and no arc buffer to use for
36da08ef
PS
2586 * the reference. As a result, we use the arc
2587 * header pointer for the reference.
2588 */
2589 (void) refcount_add_many(&new_state->arcs_size,
d3c2ae1c 2590 HDR_GET_LSIZE(hdr), hdr);
a6255b7f 2591 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
b5256303 2592 ASSERT(!HDR_HAS_RABD(hdr));
36da08ef 2593 } else {
d3c2ae1c 2594 uint32_t buffers = 0;
36da08ef
PS
2595
2596 /*
2597 * Each individual buffer holds a unique reference,
2598 * thus we must remove each of these references one
2599 * at a time.
2600 */
1c27024e 2601 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
36da08ef 2602 buf = buf->b_next) {
d3c2ae1c
GW
2603 ASSERT3U(bufcnt, !=, 0);
2604 buffers++;
2605
2606 /*
2607 * When the arc_buf_t is sharing the data
2608 * block with the hdr, the owner of the
2609 * reference belongs to the hdr. Only
2610 * add to the refcount if the arc_buf_t is
2611 * not shared.
2612 */
2aa34383 2613 if (arc_buf_is_shared(buf))
d3c2ae1c 2614 continue;
d3c2ae1c 2615
36da08ef 2616 (void) refcount_add_many(&new_state->arcs_size,
2aa34383 2617 arc_buf_size(buf), buf);
d3c2ae1c
GW
2618 }
2619 ASSERT3U(bufcnt, ==, buffers);
2620
a6255b7f 2621 if (hdr->b_l1hdr.b_pabd != NULL) {
d3c2ae1c
GW
2622 (void) refcount_add_many(&new_state->arcs_size,
2623 arc_hdr_size(hdr), hdr);
b5256303
TC
2624 }
2625
2626 if (HDR_HAS_RABD(hdr)) {
2627 (void) refcount_add_many(&new_state->arcs_size,
2628 HDR_GET_PSIZE(hdr), hdr);
36da08ef
PS
2629 }
2630 }
2631 }
2632
d3c2ae1c 2633 if (update_old && old_state != arc_l2c_only) {
36da08ef
PS
2634 ASSERT(HDR_HAS_L1HDR(hdr));
2635 if (GHOST_STATE(old_state)) {
d3c2ae1c 2636 ASSERT0(bufcnt);
a6255b7f 2637 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
b5256303 2638 ASSERT(!HDR_HAS_RABD(hdr));
d3c2ae1c 2639
36da08ef
PS
2640 /*
2641 * When moving a header off of a ghost state,
d3c2ae1c
GW
2642 * the header will not contain any arc buffers.
2643 * We use the arc header pointer for the reference
2644 * which is exactly what we did when we put the
2645 * header on the ghost state.
36da08ef
PS
2646 */
2647
36da08ef 2648 (void) refcount_remove_many(&old_state->arcs_size,
d3c2ae1c 2649 HDR_GET_LSIZE(hdr), hdr);
36da08ef 2650 } else {
d3c2ae1c 2651 uint32_t buffers = 0;
36da08ef
PS
2652
2653 /*
2654 * Each individual buffer holds a unique reference,
2655 * thus we must remove each of these references one
2656 * at a time.
2657 */
1c27024e 2658 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
36da08ef 2659 buf = buf->b_next) {
d3c2ae1c
GW
2660 ASSERT3U(bufcnt, !=, 0);
2661 buffers++;
2662
2663 /*
2664 * When the arc_buf_t is sharing the data
2665 * block with the hdr, the owner of the
2666 * reference belongs to the hdr. Only
2667 * add to the refcount if the arc_buf_t is
2668 * not shared.
2669 */
2aa34383 2670 if (arc_buf_is_shared(buf))
d3c2ae1c 2671 continue;
d3c2ae1c 2672
36da08ef 2673 (void) refcount_remove_many(
2aa34383 2674 &old_state->arcs_size, arc_buf_size(buf),
d3c2ae1c 2675 buf);
36da08ef 2676 }
d3c2ae1c 2677 ASSERT3U(bufcnt, ==, buffers);
b5256303
TC
2678 ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
2679 HDR_HAS_RABD(hdr));
2680
2681 if (hdr->b_l1hdr.b_pabd != NULL) {
2682 (void) refcount_remove_many(
2683 &old_state->arcs_size, arc_hdr_size(hdr),
2684 hdr);
2685 }
2686
2687 if (HDR_HAS_RABD(hdr)) {
2688 (void) refcount_remove_many(
2689 &old_state->arcs_size, HDR_GET_PSIZE(hdr),
2690 hdr);
2691 }
36da08ef 2692 }
34dc7c2f 2693 }
36da08ef 2694
b9541d6b
CW
2695 if (HDR_HAS_L1HDR(hdr))
2696 hdr->b_l1hdr.b_state = new_state;
34dc7c2f 2697
b9541d6b
CW
2698 /*
2699 * L2 headers should never be on the L2 state list since they don't
2700 * have L1 headers allocated.
2701 */
64fc7762
MA
2702 ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
2703 multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
34dc7c2f
BB
2704}
2705
2706void
d164b209 2707arc_space_consume(uint64_t space, arc_space_type_t type)
34dc7c2f 2708{
d164b209
BB
2709 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2710
2711 switch (type) {
e75c13c3
BB
2712 default:
2713 break;
d164b209 2714 case ARC_SPACE_DATA:
37fb3e43 2715 aggsum_add(&astat_data_size, space);
d164b209 2716 break;
cc7f677c 2717 case ARC_SPACE_META:
37fb3e43 2718 aggsum_add(&astat_metadata_size, space);
cc7f677c 2719 break;
25458cbe 2720 case ARC_SPACE_BONUS:
37fb3e43 2721 aggsum_add(&astat_bonus_size, space);
25458cbe
TC
2722 break;
2723 case ARC_SPACE_DNODE:
37fb3e43 2724 aggsum_add(&astat_dnode_size, space);
25458cbe
TC
2725 break;
2726 case ARC_SPACE_DBUF:
37fb3e43 2727 aggsum_add(&astat_dbuf_size, space);
d164b209
BB
2728 break;
2729 case ARC_SPACE_HDRS:
37fb3e43 2730 aggsum_add(&astat_hdr_size, space);
d164b209
BB
2731 break;
2732 case ARC_SPACE_L2HDRS:
37fb3e43 2733 aggsum_add(&astat_l2_hdr_size, space);
d164b209
BB
2734 break;
2735 }
2736
500445c0 2737 if (type != ARC_SPACE_DATA)
37fb3e43 2738 aggsum_add(&arc_meta_used, space);
cc7f677c 2739
37fb3e43 2740 aggsum_add(&arc_size, space);
34dc7c2f
BB
2741}
2742
2743void
d164b209 2744arc_space_return(uint64_t space, arc_space_type_t type)
34dc7c2f 2745{
d164b209
BB
2746 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2747
2748 switch (type) {
e75c13c3
BB
2749 default:
2750 break;
d164b209 2751 case ARC_SPACE_DATA:
37fb3e43 2752 aggsum_add(&astat_data_size, -space);
d164b209 2753 break;
cc7f677c 2754 case ARC_SPACE_META:
37fb3e43 2755 aggsum_add(&astat_metadata_size, -space);
cc7f677c 2756 break;
25458cbe 2757 case ARC_SPACE_BONUS:
37fb3e43 2758 aggsum_add(&astat_bonus_size, -space);
25458cbe
TC
2759 break;
2760 case ARC_SPACE_DNODE:
37fb3e43 2761 aggsum_add(&astat_dnode_size, -space);
25458cbe
TC
2762 break;
2763 case ARC_SPACE_DBUF:
37fb3e43 2764 aggsum_add(&astat_dbuf_size, -space);
d164b209
BB
2765 break;
2766 case ARC_SPACE_HDRS:
37fb3e43 2767 aggsum_add(&astat_hdr_size, -space);
d164b209
BB
2768 break;
2769 case ARC_SPACE_L2HDRS:
37fb3e43 2770 aggsum_add(&astat_l2_hdr_size, -space);
d164b209
BB
2771 break;
2772 }
2773
cc7f677c 2774 if (type != ARC_SPACE_DATA) {
37fb3e43
PD
2775 ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
2776 /*
2777 * We use the upper bound here rather than the precise value
2778 * because the arc_meta_max value doesn't need to be
2779 * precise. It's only consumed by humans via arcstats.
2780 */
2781 if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
2782 arc_meta_max = aggsum_upper_bound(&arc_meta_used);
2783 aggsum_add(&arc_meta_used, -space);
cc7f677c
PS
2784 }
2785
37fb3e43
PD
2786 ASSERT(aggsum_compare(&arc_size, space) >= 0);
2787 aggsum_add(&arc_size, -space);
34dc7c2f
BB
2788}
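/*
 * Illustrative sketch (not part of the original file): the expected
 * pairing of arc_space_consume() and arc_space_return().  A subsystem
 * that allocates memory on behalf of the ARC charges the accounting at
 * allocation time and credits it with the same size and type on free.
 * "example_alloc_bonus"/"example_free_bonus" are hypothetical helpers.
 */
static void *
example_alloc_bonus(size_t size)
{
	void *buf = zio_buf_alloc(size);

	arc_space_consume(size, ARC_SPACE_BONUS);
	return (buf);
}

static void
example_free_bonus(void *buf, size_t size)
{
	arc_space_return(size, ARC_SPACE_BONUS);
	zio_buf_free(buf, size);
}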
2789
d3c2ae1c 2790/*
524b4217 2791 * Given a hdr and a buf, returns whether that buf can share its b_data buffer
a6255b7f 2792 * with the hdr's b_pabd.
d3c2ae1c 2793 */
524b4217
DK
2794static boolean_t
2795arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2796{
524b4217
DK
2797 /*
2798 * The criteria for sharing a hdr's data are:
b5256303
TC
2799 * 1. the buffer is not encrypted
2800 * 2. the hdr's compression matches the buf's compression
2801 * 3. the hdr doesn't need to be byteswapped
2802 * 4. the hdr isn't already being shared
2803 * 5. the buf is either compressed or it is the last buf in the hdr list
524b4217 2804 *
b5256303 2805 * Criterion #5 maintains the invariant that shared uncompressed
524b4217
DK
2806 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2807 * might ask, "if a compressed buf is allocated first, won't that be the
2808 * last thing in the list?", but in that case it's impossible to create
2809 * a shared uncompressed buf anyway (because the hdr must be compressed
2810 * to have the compressed buf). You might also think that #3 is
2811 * sufficient to make this guarantee, however it's possible
2812 * (specifically in the rare L2ARC write race mentioned in
2813 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2814 * is sharable, but wasn't at the time of its allocation. Rather than
2815 * allow a new shared uncompressed buf to be created and then shuffle
2816 * the list around to make it the last element, this simply disallows
2817 * sharing if the new buf isn't the first to be added.
2818 */
2819 ASSERT3P(buf->b_hdr, ==, hdr);
b5256303
TC
2820 boolean_t hdr_compressed =
2821 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
a7004725 2822 boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
b5256303
TC
2823 return (!ARC_BUF_ENCRYPTED(buf) &&
2824 buf_compressed == hdr_compressed &&
524b4217
DK
2825 hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2826 !HDR_SHARED_DATA(hdr) &&
2827 (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2828}
2829
2830/*
2831 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
2832 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
2833 * copy was made successfully, or an error code otherwise.
2834 */
2835static int
be9a5c35
TC
2836arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
2837 void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
524b4217 2838 boolean_t fill, arc_buf_t **ret)
34dc7c2f 2839{
34dc7c2f 2840 arc_buf_t *buf;
b5256303 2841 arc_fill_flags_t flags = ARC_FILL_LOCKED;
34dc7c2f 2842
d3c2ae1c
GW
2843 ASSERT(HDR_HAS_L1HDR(hdr));
2844 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
2845 VERIFY(hdr->b_type == ARC_BUFC_DATA ||
2846 hdr->b_type == ARC_BUFC_METADATA);
524b4217
DK
2847 ASSERT3P(ret, !=, NULL);
2848 ASSERT3P(*ret, ==, NULL);
b5256303 2849 IMPLY(encrypted, compressed);
d3c2ae1c 2850
b9541d6b
CW
2851 hdr->b_l1hdr.b_mru_hits = 0;
2852 hdr->b_l1hdr.b_mru_ghost_hits = 0;
2853 hdr->b_l1hdr.b_mfu_hits = 0;
2854 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
2855 hdr->b_l1hdr.b_l2_hits = 0;
2856
524b4217 2857 buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
34dc7c2f
BB
2858 buf->b_hdr = hdr;
2859 buf->b_data = NULL;
2aa34383 2860 buf->b_next = hdr->b_l1hdr.b_buf;
524b4217 2861 buf->b_flags = 0;
b9541d6b 2862
d3c2ae1c
GW
2863 add_reference(hdr, tag);
2864
2865 /*
2866 * We're about to change the hdr's b_flags. We must either
2867 * hold the hash_lock or be undiscoverable.
2868 */
2869 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2870
2871 /*
524b4217 2872 * Only honor requests for compressed bufs if the hdr is actually
b5256303
TC
2873 * compressed. This must be overridden if the buffer is encrypted since
2874 * encrypted buffers cannot be decompressed.
524b4217 2875 */
b5256303
TC
2876 if (encrypted) {
2877 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
2878 buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
2879 flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
2880 } else if (compressed &&
2881 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
524b4217 2882 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
b5256303
TC
2883 flags |= ARC_FILL_COMPRESSED;
2884 }
2885
2886 if (noauth) {
2887 ASSERT0(encrypted);
2888 flags |= ARC_FILL_NOAUTH;
2889 }
524b4217 2890
524b4217
DK
2891 /*
2892 * If the hdr's data can be shared then we share the data buffer and
2893 * set the appropriate bit in the hdr's b_flags to indicate that the hdr is
2894 * sharing its b_pabd with the arc_buf_t; otherwise, we allocate a new buffer to store the buf's data.
524b4217 2895 *
a6255b7f
DQ
2896 * There are two additional restrictions here because we're sharing
2897 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
2898 * actively involved in an L2ARC write, because if this buf is used by
2899 * an arc_write() then the hdr's data buffer will be released when the
524b4217 2900 * write completes, even though the L2ARC write might still be using it.
a6255b7f
DQ
2901 * Second, the hdr's ABD must be linear so that the buf's user doesn't
2902 * need to be ABD-aware.
d3c2ae1c 2903 */
a7004725 2904 boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
b5256303 2905 hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
524b4217
DK
2906
2907 /* Set up b_data and sharing */
2908 if (can_share) {
a6255b7f 2909 buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
524b4217 2910 buf->b_flags |= ARC_BUF_FLAG_SHARED;
d3c2ae1c
GW
2911 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2912 } else {
524b4217
DK
2913 buf->b_data =
2914 arc_get_data_buf(hdr, arc_buf_size(buf), buf);
2915 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
d3c2ae1c
GW
2916 }
2917 VERIFY3P(buf->b_data, !=, NULL);
b9541d6b
CW
2918
2919 hdr->b_l1hdr.b_buf = buf;
d3c2ae1c 2920 hdr->b_l1hdr.b_bufcnt += 1;
b5256303
TC
2921 if (encrypted)
2922 hdr->b_crypt_hdr.b_ebufcnt += 1;
b9541d6b 2923
524b4217
DK
2924 /*
2925 * If the user wants the data from the hdr, we need to either copy or
2926 * decompress the data.
2927 */
2928 if (fill) {
be9a5c35
TC
2929 ASSERT3P(zb, !=, NULL);
2930 return (arc_buf_fill(buf, spa, zb, flags));
524b4217 2931 }
d3c2ae1c 2932
524b4217 2933 return (0);
34dc7c2f
BB
2934}
2935
9babb374
BB
2936static char *arc_onloan_tag = "onloan";
2937
a7004725
DK
2938static inline void
2939arc_loaned_bytes_update(int64_t delta)
2940{
2941 atomic_add_64(&arc_loaned_bytes, delta);
2942
2943 /* assert that it did not wrap around */
2944 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
2945}
2946
9babb374
BB
2947/*
2948 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2949 * flight data by arc_tempreserve_space() until they are "returned". Loaned
2950 * buffers must be returned to the arc before they can be used by the DMU or
2951 * freed.
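 *
 * Typical lifecycle (illustrative sketch only; "size" and "tag" are the
 * caller's values):
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, B_FALSE, size);
 *	...fill buf->b_data with size bytes...
 *	arc_return_buf(buf, tag);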
2952 */
2953arc_buf_t *
2aa34383 2954arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
9babb374 2955{
2aa34383
DK
2956 arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
2957 is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
9babb374 2958
5152a740 2959 arc_loaned_bytes_update(arc_buf_size(buf));
a7004725 2960
9babb374
BB
2961 return (buf);
2962}
2963
2aa34383
DK
2964arc_buf_t *
2965arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
2966 enum zio_compress compression_type)
2967{
2968 arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
2969 psize, lsize, compression_type);
2970
5152a740 2971 arc_loaned_bytes_update(arc_buf_size(buf));
a7004725 2972
2aa34383
DK
2973 return (buf);
2974}
2975
b5256303
TC
2976arc_buf_t *
2977arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
2978 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
2979 dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
2980 enum zio_compress compression_type)
2981{
2982 arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
2983 byteorder, salt, iv, mac, ot, psize, lsize, compression_type);
2984
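	/* Raw loaned buffers hold exactly psize bytes of (encrypted) data. */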
2985 atomic_add_64(&arc_loaned_bytes, psize);
2986 return (buf);
2987}
2988
2aa34383 2989
9babb374
BB
2990/*
2991 * Return a loaned arc buffer to the arc.
2992 */
2993void
2994arc_return_buf(arc_buf_t *buf, void *tag)
2995{
2996 arc_buf_hdr_t *hdr = buf->b_hdr;
2997
d3c2ae1c 2998 ASSERT3P(buf->b_data, !=, NULL);
b9541d6b
CW
2999 ASSERT(HDR_HAS_L1HDR(hdr));
3000 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
3001 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
9babb374 3002
a7004725 3003 arc_loaned_bytes_update(-arc_buf_size(buf));
9babb374
BB
3004}
3005
428870ff
BB
3006/* Detach an arc_buf from a dbuf (tag) */
3007void
3008arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
3009{
b9541d6b 3010 arc_buf_hdr_t *hdr = buf->b_hdr;
428870ff 3011
d3c2ae1c 3012 ASSERT3P(buf->b_data, !=, NULL);
b9541d6b
CW
3013 ASSERT(HDR_HAS_L1HDR(hdr));
3014 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
3015 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
428870ff 3016
a7004725 3017 arc_loaned_bytes_update(arc_buf_size(buf));
428870ff
BB
3018}
3019
d3c2ae1c 3020static void
a6255b7f 3021l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
34dc7c2f 3022{
d3c2ae1c 3023 l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
34dc7c2f 3024
a6255b7f 3025 df->l2df_abd = abd;
d3c2ae1c
GW
3026 df->l2df_size = size;
3027 df->l2df_type = type;
3028 mutex_enter(&l2arc_free_on_write_mtx);
3029 list_insert_head(l2arc_free_on_write, df);
3030 mutex_exit(&l2arc_free_on_write_mtx);
3031}
428870ff 3032
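/*
 * Called when a hdr's data must be freed while an L2ARC write is still in
 * flight: accounting is adjusted now, but the actual abd free is deferred to
 * the l2arc free-on-write list via l2arc_free_abd_on_write() above.
 */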
d3c2ae1c 3033static void
b5256303 3034arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
d3c2ae1c
GW
3035{
3036 arc_state_t *state = hdr->b_l1hdr.b_state;
3037 arc_buf_contents_t type = arc_buf_type(hdr);
b5256303 3038 uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
1eb5bfa3 3039
d3c2ae1c
GW
3040 /* protected by hash lock, if in the hash table */
3041 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
3042 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3043 ASSERT(state != arc_anon && state != arc_l2c_only);
3044
3045 (void) refcount_remove_many(&state->arcs_esize[type],
3046 size, hdr);
1eb5bfa3 3047 }
d3c2ae1c 3048 (void) refcount_remove_many(&state->arcs_size, size, hdr);
423e7b62
AG
3049 if (type == ARC_BUFC_METADATA) {
3050 arc_space_return(size, ARC_SPACE_META);
3051 } else {
3052 ASSERT(type == ARC_BUFC_DATA);
3053 arc_space_return(size, ARC_SPACE_DATA);
3054 }
d3c2ae1c 3055
b5256303
TC
3056 if (free_rdata) {
3057 l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
3058 } else {
3059 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
3060 }
34dc7c2f
BB
3061}
3062
d3c2ae1c
GW
3063/*
3064 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
3065 * data buffer, we transfer the refcount ownership to the hdr and update
3066 * the appropriate kstats.
3067 */
3068static void
3069arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
34dc7c2f 3070{
524b4217 3071 ASSERT(arc_can_share(hdr, buf));
a6255b7f 3072 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
b5256303 3073 ASSERT(!ARC_BUF_ENCRYPTED(buf));
d3c2ae1c 3074 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
34dc7c2f
BB
3075
3076 /*
d3c2ae1c
GW
3077 * Start sharing the data buffer. We transfer the
3078 * refcount ownership to the hdr since it always owns
3079 * the refcount whenever an arc_buf_t is shared.
34dc7c2f 3080 */
d3c2ae1c 3081 refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr);
a6255b7f
DQ
3082 hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
3083 abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
3084 HDR_ISTYPE_METADATA(hdr));
d3c2ae1c 3085 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
524b4217 3086 buf->b_flags |= ARC_BUF_FLAG_SHARED;
34dc7c2f 3087
d3c2ae1c
GW
3088 /*
3089 * Since we've transferred ownership to the hdr we need
3090 * to increment its compressed and uncompressed kstats and
3091 * decrement the overhead size.
3092 */
3093 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
3094 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
2aa34383 3095 ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
34dc7c2f
BB
3096}
3097
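/*
 * Stop sharing buf->b_data with the hdr: ownership of the data returns to
 * the buf, the hdr's b_pabd reference is dropped, and the kstats are updated
 * to count the buffer as overhead again.
 */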
ca0bf58d 3098static void
d3c2ae1c 3099arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
ca0bf58d 3100{
d3c2ae1c 3101 ASSERT(arc_buf_is_shared(buf));
a6255b7f 3102 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
d3c2ae1c 3103 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
ca0bf58d 3104
d3c2ae1c
GW
3105 /*
3106 * We are no longer sharing this buffer so we need
3107 * to transfer its ownership to the rightful owner.
3108 */
3109 refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf);
3110 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
a6255b7f
DQ
3111 abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
3112 abd_put(hdr->b_l1hdr.b_pabd);
3113 hdr->b_l1hdr.b_pabd = NULL;
524b4217 3114 buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
d3c2ae1c
GW
3115
3116 /*
3117 * Since the buffer is no longer shared between
3118 * the arc buf and the hdr, count it as overhead.
3119 */
3120 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
3121 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
2aa34383 3122 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
ca0bf58d
PS
3123}
3124
34dc7c2f 3125/*
2aa34383
DK
3126 * Remove an arc_buf_t from the hdr's buf list and return the last
3127 * arc_buf_t on the list. If no buffers remain on the list then return
3128 * NULL.
3129 */
3130static arc_buf_t *
3131arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3132{
2aa34383
DK
3133 ASSERT(HDR_HAS_L1HDR(hdr));
3134 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
3135
a7004725
DK
3136 arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
3137 arc_buf_t *lastbuf = NULL;
3138
2aa34383
DK
3139 /*
3140 * Remove the buf from the hdr list and locate the last
3141 * remaining buffer on the list.
3142 */
3143 while (*bufp != NULL) {
3144 if (*bufp == buf)
3145 *bufp = buf->b_next;
3146
3147 /*
3148 * If we've removed a buffer in the middle of
3149 * the list then update the lastbuf and update
3150 * bufp.
3151 */
3152 if (*bufp != NULL) {
3153 lastbuf = *bufp;
3154 bufp = &(*bufp)->b_next;
3155 }
3156 }
3157 buf->b_next = NULL;
3158 ASSERT3P(lastbuf, !=, buf);
3159 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
3160 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
3161 IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
3162
3163 return (lastbuf);
3164}
3165
3166/*
 3167 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
3168 * list and free it.
34dc7c2f
BB
3169 */
3170static void
2aa34383 3171arc_buf_destroy_impl(arc_buf_t *buf)
34dc7c2f 3172{
498877ba 3173 arc_buf_hdr_t *hdr = buf->b_hdr;
ca0bf58d
PS
3174
3175 /*
524b4217
DK
3176 * Free up the data associated with the buf but only if we're not
3177 * sharing this with the hdr. If we are sharing it with the hdr, the
3178 * hdr is responsible for doing the free.
ca0bf58d 3179 */
d3c2ae1c
GW
3180 if (buf->b_data != NULL) {
3181 /*
3182 * We're about to change the hdr's b_flags. We must either
3183 * hold the hash_lock or be undiscoverable.
3184 */
3185 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
3186
524b4217 3187 arc_cksum_verify(buf);
d3c2ae1c
GW
3188 arc_buf_unwatch(buf);
3189
2aa34383 3190 if (arc_buf_is_shared(buf)) {
d3c2ae1c
GW
3191 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
3192 } else {
2aa34383 3193 uint64_t size = arc_buf_size(buf);
d3c2ae1c
GW
3194 arc_free_data_buf(hdr, buf->b_data, size, buf);
3195 ARCSTAT_INCR(arcstat_overhead_size, -size);
3196 }
3197 buf->b_data = NULL;
3198
3199 ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3200 hdr->b_l1hdr.b_bufcnt -= 1;
b5256303 3201
da5d4697 3202 if (ARC_BUF_ENCRYPTED(buf)) {
b5256303
TC
3203 hdr->b_crypt_hdr.b_ebufcnt -= 1;
3204
da5d4697
D
3205 /*
3206 * If we have no more encrypted buffers and we've
3207 * already gotten a copy of the decrypted data we can
3208 * free b_rabd to save some space.
3209 */
3210 if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
3211 HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
3212 !HDR_IO_IN_PROGRESS(hdr)) {
3213 arc_hdr_free_abd(hdr, B_TRUE);
3214 }
440a3eb9 3215 }
d3c2ae1c
GW
3216 }
3217
a7004725 3218 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
d3c2ae1c 3219
524b4217 3220 if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
2aa34383 3221 /*
524b4217 3222 * If the current arc_buf_t is sharing its data buffer with the
a6255b7f 3223 * hdr, then reassign the hdr's b_pabd to share it with the new
524b4217
DK
3224 * buffer at the end of the list. The shared buffer is always
3225 * the last one on the hdr's buffer list.
3226 *
3227 * There is an equivalent case for compressed bufs, but since
3228 * they aren't guaranteed to be the last buf in the list and
 3229 * that is an exceedingly rare case, we just allow that space to be
b5256303
TC
3230 * wasted temporarily. We must also be careful not to share
3231 * encrypted buffers, since they cannot be shared.
2aa34383 3232 */
b5256303 3233 if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
524b4217 3234 /* Only one buf can be shared at once */
2aa34383 3235 VERIFY(!arc_buf_is_shared(lastbuf));
524b4217
DK
3236 /* hdr is uncompressed so can't have compressed buf */
3237 VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
d3c2ae1c 3238
a6255b7f 3239 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
b5256303 3240 arc_hdr_free_abd(hdr, B_FALSE);
d3c2ae1c 3241
2aa34383
DK
3242 /*
 3243 * We must set up a new shared block between the
3244 * last buffer and the hdr. The data would have
3245 * been allocated by the arc buf so we need to transfer
3246 * ownership to the hdr since it's now being shared.
3247 */
3248 arc_share_buf(hdr, lastbuf);
3249 }
3250 } else if (HDR_SHARED_DATA(hdr)) {
d3c2ae1c 3251 /*
2aa34383
DK
3252 * Uncompressed shared buffers are always at the end
3253 * of the list. Compressed buffers don't have the
3254 * same requirements. This makes it hard to
3255 * simply assert that the lastbuf is shared so
3256 * we rely on the hdr's compression flags to determine
3257 * if we have a compressed, shared buffer.
d3c2ae1c 3258 */
2aa34383
DK
3259 ASSERT3P(lastbuf, !=, NULL);
3260 ASSERT(arc_buf_is_shared(lastbuf) ||
b5256303 3261 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
ca0bf58d
PS
3262 }
3263
a7004725
DK
3264 /*
3265 * Free the checksum if we're removing the last uncompressed buf from
3266 * this hdr.
3267 */
3268 if (!arc_hdr_has_uncompressed_buf(hdr)) {
d3c2ae1c 3269 arc_cksum_free(hdr);
a7004725 3270 }
d3c2ae1c
GW
3271
3272 /* clean up the buf */
3273 buf->b_hdr = NULL;
3274 kmem_cache_free(buf_cache, buf);
3275}
3276
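/*
 * Allocate the hdr's backing abd: the raw (encrypted) b_rabd sized by psize
 * when alloc_rdata is set, otherwise the normal b_pabd sized by
 * arc_hdr_size().
 */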
3277static void
b5256303 3278arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata)
d3c2ae1c 3279{
b5256303
TC
3280 uint64_t size;
3281
d3c2ae1c
GW
3282 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
3283 ASSERT(HDR_HAS_L1HDR(hdr));
b5256303
TC
3284 ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
3285 IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
d3c2ae1c 3286
b5256303
TC
3287 if (alloc_rdata) {
3288 size = HDR_GET_PSIZE(hdr);
3289 ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
3290 hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr);
3291 ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
3292 ARCSTAT_INCR(arcstat_raw_size, size);
3293 } else {
3294 size = arc_hdr_size(hdr);
3295 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3296 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr);
3297 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3298 }
3299
3300 ARCSTAT_INCR(arcstat_compressed_size, size);
d3c2ae1c
GW
3301 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3302}
3303
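/*
 * Free the hdr's b_pabd, or its raw b_rabd when free_rdata is set. If the
 * header is still being written to the L2ARC the free is deferred through
 * arc_hdr_free_on_write().
 */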
3304static void
b5256303 3305arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
d3c2ae1c 3306{
b5256303
TC
3307 uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
3308
d3c2ae1c 3309 ASSERT(HDR_HAS_L1HDR(hdr));
b5256303
TC
3310 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
3311 IMPLY(free_rdata, HDR_HAS_RABD(hdr));
d3c2ae1c 3312
ca0bf58d 3313 /*
d3c2ae1c
GW
3314 * If the hdr is currently being written to the l2arc then
3315 * we defer freeing the data by adding it to the l2arc_free_on_write
3316 * list. The l2arc will free the data once it's finished
3317 * writing it to the l2arc device.
ca0bf58d 3318 */
d3c2ae1c 3319 if (HDR_L2_WRITING(hdr)) {
b5256303 3320 arc_hdr_free_on_write(hdr, free_rdata);
d3c2ae1c 3321 ARCSTAT_BUMP(arcstat_l2_free_on_write);
b5256303
TC
3322 } else if (free_rdata) {
3323 arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
d3c2ae1c 3324 } else {
b5256303 3325 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
ca0bf58d
PS
3326 }
3327
b5256303
TC
3328 if (free_rdata) {
3329 hdr->b_crypt_hdr.b_rabd = NULL;
3330 ARCSTAT_INCR(arcstat_raw_size, -size);
3331 } else {
3332 hdr->b_l1hdr.b_pabd = NULL;
3333 }
3334
3335 if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
3336 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3337
3338 ARCSTAT_INCR(arcstat_compressed_size, -size);
d3c2ae1c
GW
3339 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3340}
3341
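/*
 * Allocate a new arc_buf_hdr_t with an L1 header in the anonymous state and
 * allocate its backing abd. Protected (encrypted) headers are drawn from the
 * full-crypt kmem cache.
 */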
3342static arc_buf_hdr_t *
3343arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
b5256303
TC
3344 boolean_t protected, enum zio_compress compression_type,
3345 arc_buf_contents_t type, boolean_t alloc_rdata)
d3c2ae1c
GW
3346{
3347 arc_buf_hdr_t *hdr;
3348
d3c2ae1c 3349 VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
b5256303
TC
3350 if (protected) {
3351 hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
3352 } else {
3353 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3354 }
d3c2ae1c 3355
d3c2ae1c
GW
3356 ASSERT(HDR_EMPTY(hdr));
3357 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3358 HDR_SET_PSIZE(hdr, psize);
3359 HDR_SET_LSIZE(hdr, lsize);
3360 hdr->b_spa = spa;
3361 hdr->b_type = type;
3362 hdr->b_flags = 0;
3363 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
2aa34383 3364 arc_hdr_set_compress(hdr, compression_type);
b5256303
TC
3365 if (protected)
3366 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
ca0bf58d 3367
d3c2ae1c
GW
3368 hdr->b_l1hdr.b_state = arc_anon;
3369 hdr->b_l1hdr.b_arc_access = 0;
3370 hdr->b_l1hdr.b_bufcnt = 0;
3371 hdr->b_l1hdr.b_buf = NULL;
ca0bf58d 3372
d3c2ae1c
GW
3373 /*
3374 * Allocate the hdr's buffer. This will contain either
3375 * the compressed or uncompressed data depending on the block
 3376 * it references and whether compressed ARC is enabled.
3377 */
b5256303 3378 arc_hdr_alloc_abd(hdr, alloc_rdata);
d3c2ae1c 3379 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ca0bf58d 3380
d3c2ae1c 3381 return (hdr);
ca0bf58d
PS
3382}
3383
bd089c54 3384/*
d3c2ae1c
GW
3385 * Transition between the two allocation states for the arc_buf_hdr struct.
3386 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
3387 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
3388 * version is used when a cache buffer is only in the L2ARC in order to reduce
3389 * memory usage.
bd089c54 3390 */
d3c2ae1c
GW
3391static arc_buf_hdr_t *
3392arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
34dc7c2f 3393{
1c27024e
DB
3394 ASSERT(HDR_HAS_L2HDR(hdr));
3395
d3c2ae1c
GW
3396 arc_buf_hdr_t *nhdr;
3397 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
34dc7c2f 3398
d3c2ae1c
GW
3399 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
3400 (old == hdr_l2only_cache && new == hdr_full_cache));
34dc7c2f 3401
b5256303
TC
3402 /*
 3403 * If the caller wanted a new full header and the header is to be
3404 * encrypted we will actually allocate the header from the full crypt
3405 * cache instead. The same applies to freeing from the old cache.
3406 */
3407 if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
3408 new = hdr_full_crypt_cache;
3409 if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
3410 old = hdr_full_crypt_cache;
3411
d3c2ae1c 3412 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
428870ff 3413
d3c2ae1c
GW
3414 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
3415 buf_hash_remove(hdr);
ca0bf58d 3416
d3c2ae1c 3417 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
34dc7c2f 3418
b5256303 3419 if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
d3c2ae1c
GW
3420 arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3421 /*
3422 * arc_access and arc_change_state need to be aware that a
3423 * header has just come out of L2ARC, so we set its state to
3424 * l2c_only even though it's about to change.
3425 */
3426 nhdr->b_l1hdr.b_state = arc_l2c_only;
34dc7c2f 3427
d3c2ae1c 3428 /* Verify previous threads set to NULL before freeing */
a6255b7f 3429 ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
b5256303 3430 ASSERT(!HDR_HAS_RABD(hdr));
d3c2ae1c
GW
3431 } else {
3432 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3433 ASSERT0(hdr->b_l1hdr.b_bufcnt);
3434 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
36da08ef 3435
d3c2ae1c
GW
3436 /*
 3437 * If we've reached here, we must have been called from
3438 * arc_evict_hdr(), as such we should have already been
3439 * removed from any ghost list we were previously on
3440 * (which protects us from racing with arc_evict_state),
3441 * thus no locking is needed during this check.
3442 */
3443 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1eb5bfa3
GW
3444
3445 /*
d3c2ae1c
GW
3446 * A buffer must not be moved into the arc_l2c_only
3447 * state if it's not finished being written out to the
a6255b7f 3448 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
d3c2ae1c 3449 * might be accessed even though it was removed.
1eb5bfa3 3450 */
d3c2ae1c 3451 VERIFY(!HDR_L2_WRITING(hdr));
a6255b7f 3452 VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
b5256303 3453 ASSERT(!HDR_HAS_RABD(hdr));
d3c2ae1c
GW
3454
3455 arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
34dc7c2f 3456 }
d3c2ae1c
GW
3457 /*
3458 * The header has been reallocated so we need to re-insert it into any
3459 * lists it was on.
3460 */
3461 (void) buf_hash_insert(nhdr, NULL);
34dc7c2f 3462
d3c2ae1c 3463 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
34dc7c2f 3464
d3c2ae1c
GW
3465 mutex_enter(&dev->l2ad_mtx);
3466
3467 /*
3468 * We must place the realloc'ed header back into the list at
3469 * the same spot. Otherwise, if it's placed earlier in the list,
3470 * l2arc_write_buffers() could find it during the function's
3471 * write phase, and try to write it out to the l2arc.
3472 */
3473 list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
3474 list_remove(&dev->l2ad_buflist, hdr);
34dc7c2f 3475
d3c2ae1c 3476 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 3477
d3c2ae1c
GW
3478 /*
3479 * Since we're using the pointer address as the tag when
3480 * incrementing and decrementing the l2ad_alloc refcount, we
3481 * must remove the old pointer (that we're about to destroy) and
3482 * add the new pointer to the refcount. Otherwise we'd remove
3483 * the wrong pointer address when calling arc_hdr_destroy() later.
3484 */
3485
3486 (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
3487 (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);
3488
3489 buf_discard_identity(hdr);
3490 kmem_cache_free(old, hdr);
3491
3492 return (nhdr);
3493}
3494
b5256303
TC
3495/*
3496 * This function allows an L1 header to be reallocated as a crypt
3497 * header and vice versa. If we are going to a crypt header, the
3498 * new fields will be zeroed out.
3499 */
3500static arc_buf_hdr_t *
3501arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
3502{
3503 arc_buf_hdr_t *nhdr;
3504 arc_buf_t *buf;
3505 kmem_cache_t *ncache, *ocache;
b7ddeaef 3506 unsigned nsize, osize;
b5256303 3507
b7ddeaef
TC
3508 /*
3509 * This function requires that hdr is in the arc_anon state.
3510 * Therefore it won't have any L2ARC data for us to worry
3511 * about copying.
3512 */
b5256303 3513 ASSERT(HDR_HAS_L1HDR(hdr));
b7ddeaef 3514 ASSERT(!HDR_HAS_L2HDR(hdr));
b5256303
TC
3515 ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
3516 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3517 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
b7ddeaef
TC
3518 ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
3519 ASSERT3P(hdr->b_hash_next, ==, NULL);
b5256303
TC
3520
3521 if (need_crypt) {
3522 ncache = hdr_full_crypt_cache;
b7ddeaef 3523 nsize = sizeof (hdr->b_crypt_hdr);
b5256303 3524 ocache = hdr_full_cache;
b7ddeaef 3525 osize = HDR_FULL_SIZE;
b5256303
TC
3526 } else {
3527 ncache = hdr_full_cache;
b7ddeaef 3528 nsize = HDR_FULL_SIZE;
b5256303 3529 ocache = hdr_full_crypt_cache;
b7ddeaef 3530 osize = sizeof (hdr->b_crypt_hdr);
b5256303
TC
3531 }
3532
3533 nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
b7ddeaef
TC
3534
3535 /*
3536 * Copy all members that aren't locks or condvars to the new header.
3537 * No lists are pointing to us (as we asserted above), so we don't
3538 * need to worry about the list nodes.
3539 */
3540 nhdr->b_dva = hdr->b_dva;
3541 nhdr->b_birth = hdr->b_birth;
3542 nhdr->b_type = hdr->b_type;
3543 nhdr->b_flags = hdr->b_flags;
3544 nhdr->b_psize = hdr->b_psize;
3545 nhdr->b_lsize = hdr->b_lsize;
3546 nhdr->b_spa = hdr->b_spa;
b5256303
TC
3547 nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
3548 nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
3549 nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
3550 nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
3551 nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
3552 nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
3553 nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
3554 nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
3555 nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
3556 nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits;
3557 nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
3558 nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
b5256303
TC
3559
3560 /*
3561 * This refcount_add() exists only to ensure that the individual
3562 * arc buffers always point to a header that is referenced, avoiding
3563 * a small race condition that could trigger ASSERTs.
3564 */
3565 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
b7ddeaef 3566 nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
b5256303
TC
3567 for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
3568 mutex_enter(&buf->b_evict_lock);
3569 buf->b_hdr = nhdr;
3570 mutex_exit(&buf->b_evict_lock);
3571 }
3572
3573 refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
3574 (void) refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
b7ddeaef 3575 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
b5256303
TC
3576
3577 if (need_crypt) {
3578 arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
3579 } else {
3580 arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
3581 }
3582
b7ddeaef
TC
3583 /* unset all members of the original hdr */
3584 bzero(&hdr->b_dva, sizeof (dva_t));
3585 hdr->b_birth = 0;
3586 hdr->b_type = ARC_BUFC_INVALID;
3587 hdr->b_flags = 0;
3588 hdr->b_psize = 0;
3589 hdr->b_lsize = 0;
3590 hdr->b_spa = 0;
3591 hdr->b_l1hdr.b_freeze_cksum = NULL;
3592 hdr->b_l1hdr.b_buf = NULL;
3593 hdr->b_l1hdr.b_bufcnt = 0;
3594 hdr->b_l1hdr.b_byteswap = 0;
3595 hdr->b_l1hdr.b_state = NULL;
3596 hdr->b_l1hdr.b_arc_access = 0;
3597 hdr->b_l1hdr.b_mru_hits = 0;
3598 hdr->b_l1hdr.b_mru_ghost_hits = 0;
3599 hdr->b_l1hdr.b_mfu_hits = 0;
3600 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
3601 hdr->b_l1hdr.b_l2_hits = 0;
3602 hdr->b_l1hdr.b_acb = NULL;
3603 hdr->b_l1hdr.b_pabd = NULL;
3604
3605 if (ocache == hdr_full_crypt_cache) {
3606 ASSERT(!HDR_HAS_RABD(hdr));
3607 hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
3608 hdr->b_crypt_hdr.b_ebufcnt = 0;
3609 hdr->b_crypt_hdr.b_dsobj = 0;
3610 bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3611 bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3612 bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3613 }
3614
b5256303
TC
3615 buf_discard_identity(hdr);
3616 kmem_cache_free(ocache, hdr);
3617
3618 return (nhdr);
3619}
3620
3621/*
3622 * This function is used by the send / receive code to convert a newly
3623 * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
 3624 * is also used to allow the root objset block to be updated without altering
3625 * its embedded MACs. Both block types will always be uncompressed so we do not
3626 * have to worry about compression type or psize.
3627 */
3628void
3629arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
3630 dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
3631 const uint8_t *mac)
3632{
3633 arc_buf_hdr_t *hdr = buf->b_hdr;
3634
3635 ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
3636 ASSERT(HDR_HAS_L1HDR(hdr));
3637 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3638
3639 buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
3640 if (!HDR_PROTECTED(hdr))
3641 hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
3642 hdr->b_crypt_hdr.b_dsobj = dsobj;
3643 hdr->b_crypt_hdr.b_ot = ot;
3644 hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3645 DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3646 if (!arc_hdr_has_uncompressed_buf(hdr))
3647 arc_cksum_free(hdr);
3648
3649 if (salt != NULL)
3650 bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3651 if (iv != NULL)
3652 bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3653 if (mac != NULL)
3654 bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3655}
3656
d3c2ae1c
GW
3657/*
3658 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3659 * The buf is returned thawed since we expect the consumer to modify it.
3660 */
3661arc_buf_t *
2aa34383 3662arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
d3c2ae1c 3663{
d3c2ae1c 3664 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
b5256303 3665 B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE);
d3c2ae1c 3666 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
2aa34383 3667
a7004725 3668 arc_buf_t *buf = NULL;
be9a5c35 3669 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
b5256303 3670 B_FALSE, B_FALSE, &buf));
d3c2ae1c 3671 arc_buf_thaw(buf);
2aa34383
DK
3672
3673 return (buf);
3674}
3675
3676/*
3677 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
3678 * for bufs containing metadata.
3679 */
3680arc_buf_t *
3681arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
3682 enum zio_compress compression_type)
3683{
2aa34383
DK
3684 ASSERT3U(lsize, >, 0);
3685 ASSERT3U(lsize, >=, psize);
b5256303
TC
3686 ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
3687 ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
2aa34383 3688
a7004725 3689 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
b5256303 3690 B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE);
2aa34383
DK
3691 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3692
a7004725 3693 arc_buf_t *buf = NULL;
be9a5c35 3694 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
b5256303 3695 B_TRUE, B_FALSE, B_FALSE, &buf));
2aa34383
DK
3696 arc_buf_thaw(buf);
3697 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3698
a6255b7f
DQ
3699 if (!arc_buf_is_shared(buf)) {
3700 /*
3701 * To ensure that the hdr has the correct data in it if we call
b5256303 3702 * arc_untransform() on this buf before it's been written to
a6255b7f
DQ
3703 * disk, it's easiest if we just set up sharing between the
3704 * buf and the hdr.
3705 */
3706 ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
b5256303 3707 arc_hdr_free_abd(hdr, B_FALSE);
a6255b7f
DQ
3708 arc_share_buf(hdr, buf);
3709 }
3710
d3c2ae1c 3711 return (buf);
34dc7c2f
BB
3712}
3713
b5256303
TC
3714arc_buf_t *
3715arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
3716 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
3717 dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
3718 enum zio_compress compression_type)
3719{
3720 arc_buf_hdr_t *hdr;
3721 arc_buf_t *buf;
3722 arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
3723 ARC_BUFC_METADATA : ARC_BUFC_DATA;
3724
3725 ASSERT3U(lsize, >, 0);
3726 ASSERT3U(lsize, >=, psize);
3727 ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
3728 ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3729
3730 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
3731 compression_type, type, B_TRUE);
3732 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3733
3734 hdr->b_crypt_hdr.b_dsobj = dsobj;
3735 hdr->b_crypt_hdr.b_ot = ot;
3736 hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3737 DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3738 bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3739 bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3740 bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3741
3742 /*
3743 * This buffer will be considered encrypted even if the ot is not an
3744 * encrypted type. It will become authenticated instead in
3745 * arc_write_ready().
3746 */
3747 buf = NULL;
be9a5c35 3748 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
b5256303
TC
3749 B_FALSE, B_FALSE, &buf));
3750 arc_buf_thaw(buf);
3751 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3752
3753 return (buf);
3754}
3755
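/*
 * Tear down the hdr's L2ARC state: remove it from the device's buflist,
 * return its space to the vdev, drop the l2ad_alloc refcount, and clear
 * ARC_FLAG_HAS_L2HDR.
 */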
d962d5da
PS
3756static void
3757arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
3758{
3759 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3760 l2arc_dev_t *dev = l2hdr->b_dev;
01850391 3761 uint64_t psize = arc_hdr_size(hdr);
d962d5da
PS
3762
3763 ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
3764 ASSERT(HDR_HAS_L2HDR(hdr));
3765
3766 list_remove(&dev->l2ad_buflist, hdr);
3767
01850391
AG
3768 ARCSTAT_INCR(arcstat_l2_psize, -psize);
3769 ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
d962d5da 3770
01850391 3771 vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);
d962d5da 3772
01850391 3773 (void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
d3c2ae1c 3774 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
d962d5da
PS
3775}
3776
34dc7c2f
BB
3777static void
3778arc_hdr_destroy(arc_buf_hdr_t *hdr)
3779{
b9541d6b
CW
3780 if (HDR_HAS_L1HDR(hdr)) {
3781 ASSERT(hdr->b_l1hdr.b_buf == NULL ||
d3c2ae1c 3782 hdr->b_l1hdr.b_bufcnt > 0);
b9541d6b
CW
3783 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3784 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3785 }
34dc7c2f 3786 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
b9541d6b
CW
3787 ASSERT(!HDR_IN_HASH_TABLE(hdr));
3788
d3c2ae1c
GW
3789 if (!HDR_EMPTY(hdr))
3790 buf_discard_identity(hdr);
3791
b9541d6b 3792 if (HDR_HAS_L2HDR(hdr)) {
d962d5da
PS
3793 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3794 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
428870ff 3795
d962d5da
PS
3796 if (!buflist_held)
3797 mutex_enter(&dev->l2ad_mtx);
b9541d6b 3798
ca0bf58d 3799 /*
d962d5da
PS
3800 * Even though we checked this conditional above, we
3801 * need to check this again now that we have the
3802 * l2ad_mtx. This is because we could be racing with
3803 * another thread calling l2arc_evict() which might have
3804 * destroyed this header's L2 portion as we were waiting
3805 * to acquire the l2ad_mtx. If that happens, we don't
3806 * want to re-destroy the header's L2 portion.
ca0bf58d 3807 */
d962d5da
PS
3808 if (HDR_HAS_L2HDR(hdr))
3809 arc_hdr_l2hdr_destroy(hdr);
428870ff
BB
3810
3811 if (!buflist_held)
d962d5da 3812 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
3813 }
3814
d3c2ae1c
GW
3815 if (HDR_HAS_L1HDR(hdr)) {
3816 arc_cksum_free(hdr);
b9541d6b 3817
d3c2ae1c 3818 while (hdr->b_l1hdr.b_buf != NULL)
2aa34383 3819 arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
34dc7c2f 3820
b5256303
TC
3821 if (hdr->b_l1hdr.b_pabd != NULL) {
3822 arc_hdr_free_abd(hdr, B_FALSE);
3823 }
3824
440a3eb9 3825 if (HDR_HAS_RABD(hdr))
b5256303 3826 arc_hdr_free_abd(hdr, B_TRUE);
b9541d6b
CW
3827 }
3828
34dc7c2f 3829 ASSERT3P(hdr->b_hash_next, ==, NULL);
b9541d6b 3830 if (HDR_HAS_L1HDR(hdr)) {
ca0bf58d 3831 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
b9541d6b 3832 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
b5256303
TC
3833
3834 if (!HDR_PROTECTED(hdr)) {
3835 kmem_cache_free(hdr_full_cache, hdr);
3836 } else {
3837 kmem_cache_free(hdr_full_crypt_cache, hdr);
3838 }
b9541d6b
CW
3839 } else {
3840 kmem_cache_free(hdr_l2only_cache, hdr);
3841 }
34dc7c2f
BB
3842}
3843
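/*
 * Release the caller's hold on an arc_buf_t and free it. An anonymous hdr
 * with a single buf is destroyed outright; otherwise the buf is removed from
 * the hdr under its hash lock.
 */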
3844void
d3c2ae1c 3845arc_buf_destroy(arc_buf_t *buf, void* tag)
34dc7c2f
BB
3846{
3847 arc_buf_hdr_t *hdr = buf->b_hdr;
96c080cb 3848 kmutex_t *hash_lock = HDR_LOCK(hdr);
34dc7c2f 3849
b9541d6b 3850 if (hdr->b_l1hdr.b_state == arc_anon) {
d3c2ae1c
GW
3851 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
3852 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3853 VERIFY0(remove_reference(hdr, NULL, tag));
3854 arc_hdr_destroy(hdr);
3855 return;
34dc7c2f
BB
3856 }
3857
3858 mutex_enter(hash_lock);
d3c2ae1c
GW
3859 ASSERT3P(hdr, ==, buf->b_hdr);
3860 ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
428870ff 3861 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
d3c2ae1c
GW
3862 ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
3863 ASSERT3P(buf->b_data, !=, NULL);
34dc7c2f
BB
3864
3865 (void) remove_reference(hdr, hash_lock, tag);
2aa34383 3866 arc_buf_destroy_impl(buf);
34dc7c2f 3867 mutex_exit(hash_lock);
34dc7c2f
BB
3868}
3869
34dc7c2f 3870/*
ca0bf58d
PS
3871 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
3872 * state of the header is dependent on its state prior to entering this
3873 * function. The following transitions are possible:
34dc7c2f 3874 *
ca0bf58d
PS
3875 * - arc_mru -> arc_mru_ghost
3876 * - arc_mfu -> arc_mfu_ghost
3877 * - arc_mru_ghost -> arc_l2c_only
3878 * - arc_mru_ghost -> deleted
3879 * - arc_mfu_ghost -> arc_l2c_only
3880 * - arc_mfu_ghost -> deleted
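 *
 * Returns the number of bytes evicted from the header.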
34dc7c2f 3881 */
ca0bf58d
PS
3882static int64_t
3883arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
34dc7c2f 3884{
ca0bf58d
PS
3885 arc_state_t *evicted_state, *state;
3886 int64_t bytes_evicted = 0;
d4a72f23
TC
3887 int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
3888 arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
34dc7c2f 3889
ca0bf58d
PS
3890 ASSERT(MUTEX_HELD(hash_lock));
3891 ASSERT(HDR_HAS_L1HDR(hdr));
e8b96c60 3892
ca0bf58d
PS
3893 state = hdr->b_l1hdr.b_state;
3894 if (GHOST_STATE(state)) {
3895 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
d3c2ae1c 3896 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
e8b96c60
MA
3897
3898 /*
ca0bf58d 3899 * l2arc_write_buffers() relies on a header's L1 portion
a6255b7f 3900 * (i.e. its b_pabd field) during its write phase.
ca0bf58d
PS
3901 * Thus, we cannot push a header onto the arc_l2c_only
3902 * state (removing its L1 piece) until the header is
3903 * done being written to the l2arc.
e8b96c60 3904 */
ca0bf58d
PS
3905 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
3906 ARCSTAT_BUMP(arcstat_evict_l2_skip);
3907 return (bytes_evicted);
e8b96c60
MA
3908 }
3909
ca0bf58d 3910 ARCSTAT_BUMP(arcstat_deleted);
d3c2ae1c 3911 bytes_evicted += HDR_GET_LSIZE(hdr);
428870ff 3912
ca0bf58d 3913 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
428870ff 3914
ca0bf58d 3915 if (HDR_HAS_L2HDR(hdr)) {
a6255b7f 3916 ASSERT(hdr->b_l1hdr.b_pabd == NULL);
b5256303 3917 ASSERT(!HDR_HAS_RABD(hdr));
ca0bf58d
PS
3918 /*
3919 * This buffer is cached on the 2nd Level ARC;
3920 * don't destroy the header.
3921 */
3922 arc_change_state(arc_l2c_only, hdr, hash_lock);
3923 /*
3924 * dropping from L1+L2 cached to L2-only,
3925 * realloc to remove the L1 header.
3926 */
3927 hdr = arc_hdr_realloc(hdr, hdr_full_cache,
3928 hdr_l2only_cache);
34dc7c2f 3929 } else {
ca0bf58d
PS
3930 arc_change_state(arc_anon, hdr, hash_lock);
3931 arc_hdr_destroy(hdr);
34dc7c2f 3932 }
ca0bf58d 3933 return (bytes_evicted);
34dc7c2f
BB
3934 }
3935
ca0bf58d
PS
3936 ASSERT(state == arc_mru || state == arc_mfu);
3937 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
34dc7c2f 3938
ca0bf58d
PS
3939 /* prefetch buffers have a minimum lifespan */
3940 if (HDR_IO_IN_PROGRESS(hdr) ||
3941 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2b84817f
TC
3942 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
3943 MSEC_TO_TICK(min_lifetime))) {
ca0bf58d
PS
3944 ARCSTAT_BUMP(arcstat_evict_skip);
3945 return (bytes_evicted);
da8ccd0e
PS
3946 }
3947
ca0bf58d 3948 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
ca0bf58d
PS
3949 while (hdr->b_l1hdr.b_buf) {
3950 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
3951 if (!mutex_tryenter(&buf->b_evict_lock)) {
3952 ARCSTAT_BUMP(arcstat_mutex_miss);
3953 break;
3954 }
3955 if (buf->b_data != NULL)
d3c2ae1c
GW
3956 bytes_evicted += HDR_GET_LSIZE(hdr);
3957 mutex_exit(&buf->b_evict_lock);
2aa34383 3958 arc_buf_destroy_impl(buf);
ca0bf58d 3959 }
34dc7c2f 3960
ca0bf58d 3961 if (HDR_HAS_L2HDR(hdr)) {
d3c2ae1c 3962 ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
ca0bf58d 3963 } else {
d3c2ae1c
GW
3964 if (l2arc_write_eligible(hdr->b_spa, hdr)) {
3965 ARCSTAT_INCR(arcstat_evict_l2_eligible,
3966 HDR_GET_LSIZE(hdr));
3967 } else {
3968 ARCSTAT_INCR(arcstat_evict_l2_ineligible,
3969 HDR_GET_LSIZE(hdr));
3970 }
ca0bf58d 3971 }
34dc7c2f 3972
d3c2ae1c
GW
3973 if (hdr->b_l1hdr.b_bufcnt == 0) {
3974 arc_cksum_free(hdr);
3975
3976 bytes_evicted += arc_hdr_size(hdr);
3977
3978 /*
3979 * If this hdr is being evicted and has a compressed
3980 * buffer then we discard it here before we change states.
3981 * This ensures that the accounting is updated correctly
a6255b7f 3982 * in arc_free_data_impl().
d3c2ae1c 3983 */
b5256303
TC
3984 if (hdr->b_l1hdr.b_pabd != NULL)
3985 arc_hdr_free_abd(hdr, B_FALSE);
3986
3987 if (HDR_HAS_RABD(hdr))
3988 arc_hdr_free_abd(hdr, B_TRUE);
d3c2ae1c 3989
ca0bf58d
PS
3990 arc_change_state(evicted_state, hdr, hash_lock);
3991 ASSERT(HDR_IN_HASH_TABLE(hdr));
d3c2ae1c 3992 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
ca0bf58d
PS
3993 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
3994 }
34dc7c2f 3995
ca0bf58d 3996 return (bytes_evicted);
34dc7c2f
BB
3997}
3998
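/*
 * Evict eligible headers from a single sublist of the given multilist,
 * stopping once 'bytes' have been evicted (unless ARC_EVICT_ALL was passed)
 * or zfs_arc_evict_batch_limit headers have been evicted. The marker records
 * our position so a later pass can resume where this one left off.
 */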
ca0bf58d
PS
3999static uint64_t
4000arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
4001 uint64_t spa, int64_t bytes)
34dc7c2f 4002{
ca0bf58d
PS
4003 multilist_sublist_t *mls;
4004 uint64_t bytes_evicted = 0;
4005 arc_buf_hdr_t *hdr;
34dc7c2f 4006 kmutex_t *hash_lock;
ca0bf58d 4007 int evict_count = 0;
34dc7c2f 4008
ca0bf58d 4009 ASSERT3P(marker, !=, NULL);
96c080cb 4010 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
ca0bf58d
PS
4011
4012 mls = multilist_sublist_lock(ml, idx);
572e2857 4013
ca0bf58d
PS
4014 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
4015 hdr = multilist_sublist_prev(mls, marker)) {
4016 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
4017 (evict_count >= zfs_arc_evict_batch_limit))
4018 break;
4019
4020 /*
4021 * To keep our iteration location, move the marker
4022 * forward. Since we're not holding hdr's hash lock, we
4023 * must be very careful and not remove 'hdr' from the
4024 * sublist. Otherwise, other consumers might mistake the
4025 * 'hdr' as not being on a sublist when they call the
4026 * multilist_link_active() function (they all rely on
4027 * the hash lock protecting concurrent insertions and
4028 * removals). multilist_sublist_move_forward() was
4029 * specifically implemented to ensure this is the case
4030 * (only 'marker' will be removed and re-inserted).
4031 */
4032 multilist_sublist_move_forward(mls, marker);
4033
4034 /*
4035 * The only case where the b_spa field should ever be
4036 * zero, is the marker headers inserted by
4037 * arc_evict_state(). It's possible for multiple threads
4038 * to be calling arc_evict_state() concurrently (e.g.
4039 * dsl_pool_close() and zio_inject_fault()), so we must
4040 * skip any markers we see from these other threads.
4041 */
2a432414 4042 if (hdr->b_spa == 0)
572e2857
BB
4043 continue;
4044
ca0bf58d
PS
4045 /* we're only interested in evicting buffers of a certain spa */
4046 if (spa != 0 && hdr->b_spa != spa) {
4047 ARCSTAT_BUMP(arcstat_evict_skip);
428870ff 4048 continue;
ca0bf58d
PS
4049 }
4050
4051 hash_lock = HDR_LOCK(hdr);
e8b96c60
MA
4052
4053 /*
ca0bf58d
PS
4054 * We aren't calling this function from any code path
4055 * that would already be holding a hash lock, so we're
4056 * asserting on this assumption to be defensive in case
4057 * this ever changes. Without this check, it would be
4058 * possible to incorrectly increment arcstat_mutex_miss
4059 * below (e.g. if the code changed such that we called
4060 * this function with a hash lock held).
e8b96c60 4061 */
ca0bf58d
PS
4062 ASSERT(!MUTEX_HELD(hash_lock));
4063
34dc7c2f 4064 if (mutex_tryenter(hash_lock)) {
ca0bf58d
PS
4065 uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
4066 mutex_exit(hash_lock);
34dc7c2f 4067
ca0bf58d 4068 bytes_evicted += evicted;
34dc7c2f 4069
572e2857 4070 /*
ca0bf58d
PS
4071 * If evicted is zero, arc_evict_hdr() must have
4072 * decided to skip this header, don't increment
4073 * evict_count in this case.
572e2857 4074 */
ca0bf58d
PS
4075 if (evicted != 0)
4076 evict_count++;
4077
4078 /*
4079 * If arc_size isn't overflowing, signal any
4080 * threads that might happen to be waiting.
4081 *
4082 * For each header evicted, we wake up a single
4083 * thread. If we used cv_broadcast, we could
4084 * wake up "too many" threads causing arc_size
4085 * to significantly overflow arc_c; since
a6255b7f 4086 * arc_get_data_impl() doesn't check for overflow
ca0bf58d
PS
4087 * when it's woken up (it doesn't because it's
4088 * possible for the ARC to be overflowing while
4089 * full of un-evictable buffers, and the
4090 * function should proceed in this case).
4091 *
4092 * If threads are left sleeping, due to not
4093 * using cv_broadcast, they will be woken up
4094 * just before arc_reclaim_thread() sleeps.
4095 */
4096 mutex_enter(&arc_reclaim_lock);
4097 if (!arc_is_overflowing())
4098 cv_signal(&arc_reclaim_waiters_cv);
4099 mutex_exit(&arc_reclaim_lock);
e8b96c60 4100 } else {
ca0bf58d 4101 ARCSTAT_BUMP(arcstat_mutex_miss);
e8b96c60 4102 }
34dc7c2f 4103 }
34dc7c2f 4104
ca0bf58d 4105 multilist_sublist_unlock(mls);
34dc7c2f 4106
ca0bf58d 4107 return (bytes_evicted);
34dc7c2f
BB
4108}
4109
ca0bf58d
PS
4110/*
4111 * Evict buffers from the given arc state, until we've removed the
4112 * specified number of bytes. Move the removed buffers to the
4113 * appropriate evict state.
4114 *
4115 * This function makes a "best effort". It skips over any buffers
4116 * it can't get a hash_lock on, and so, may not catch all candidates.
4117 * It may also return without evicting as much space as requested.
4118 *
4119 * If bytes is specified using the special value ARC_EVICT_ALL, this
4120 * will evict all available (i.e. unlocked and evictable) buffers from
4121 * the given arc state; which is used by arc_flush().
4122 */
4123static uint64_t
4124arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
4125 arc_buf_contents_t type)
34dc7c2f 4126{
ca0bf58d 4127 uint64_t total_evicted = 0;
64fc7762 4128 multilist_t *ml = state->arcs_list[type];
ca0bf58d
PS
4129 int num_sublists;
4130 arc_buf_hdr_t **markers;
ca0bf58d 4131
96c080cb 4132 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
ca0bf58d
PS
4133
4134 num_sublists = multilist_get_num_sublists(ml);
d164b209
BB
4135
4136 /*
ca0bf58d
PS
4137 * If we've tried to evict from each sublist, made some
4138 * progress, but still have not hit the target number of bytes
4139 * to evict, we want to keep trying. The markers allow us to
4140 * pick up where we left off for each individual sublist, rather
4141 * than starting from the tail each time.
d164b209 4142 */
ca0bf58d 4143 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
1c27024e 4144 for (int i = 0; i < num_sublists; i++) {
ca0bf58d 4145 multilist_sublist_t *mls;
34dc7c2f 4146
ca0bf58d
PS
4147 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
4148
4149 /*
4150 * A b_spa of 0 is used to indicate that this header is
4151 * a marker. This fact is used in arc_adjust_type() and
4152 * arc_evict_state_impl().
4153 */
4154 markers[i]->b_spa = 0;
34dc7c2f 4155
ca0bf58d
PS
4156 mls = multilist_sublist_lock(ml, i);
4157 multilist_sublist_insert_tail(mls, markers[i]);
4158 multilist_sublist_unlock(mls);
34dc7c2f
BB
4159 }
4160
d164b209 4161 /*
ca0bf58d
PS
4162 * While we haven't hit our target number of bytes to evict, or
4163 * we're evicting all available buffers.
d164b209 4164 */
ca0bf58d 4165 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
25458cbe
TC
4166 int sublist_idx = multilist_get_random_index(ml);
4167 uint64_t scan_evicted = 0;
4168
4169 /*
4170 * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
4171 * Request that 10% of the LRUs be scanned by the superblock
4172 * shrinker.
4173 */
37fb3e43
PD
4174 if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
4175 arc_dnode_limit) > 0) {
4176 arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
4177 arc_dnode_limit) / sizeof (dnode_t) /
4178 zfs_arc_dnode_reduce_percent);
4179 }
25458cbe 4180
ca0bf58d
PS
4181 /*
4182 * Start eviction using a randomly selected sublist,
4183 * this is to try and evenly balance eviction across all
4184 * sublists. Always starting at the same sublist
4185 * (e.g. index 0) would cause evictions to favor certain
4186 * sublists over others.
4187 */
1c27024e 4188 for (int i = 0; i < num_sublists; i++) {
ca0bf58d
PS
4189 uint64_t bytes_remaining;
4190 uint64_t bytes_evicted;
d164b209 4191
ca0bf58d
PS
4192 if (bytes == ARC_EVICT_ALL)
4193 bytes_remaining = ARC_EVICT_ALL;
4194 else if (total_evicted < bytes)
4195 bytes_remaining = bytes - total_evicted;
4196 else
4197 break;
34dc7c2f 4198
ca0bf58d
PS
4199 bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
4200 markers[sublist_idx], spa, bytes_remaining);
4201
4202 scan_evicted += bytes_evicted;
4203 total_evicted += bytes_evicted;
4204
4205 /* we've reached the end, wrap to the beginning */
4206 if (++sublist_idx >= num_sublists)
4207 sublist_idx = 0;
4208 }
4209
4210 /*
4211 * If we didn't evict anything during this scan, we have
4212 * no reason to believe we'll evict more during another
4213 * scan, so break the loop.
4214 */
4215 if (scan_evicted == 0) {
4216 /* This isn't possible, let's make that obvious */
4217 ASSERT3S(bytes, !=, 0);
34dc7c2f 4218
ca0bf58d
PS
4219 /*
4220 * When bytes is ARC_EVICT_ALL, the only way to
4221 * break the loop is when scan_evicted is zero.
4222 * In that case, we actually have evicted enough,
4223 * so we don't want to increment the kstat.
4224 */
4225 if (bytes != ARC_EVICT_ALL) {
4226 ASSERT3S(total_evicted, <, bytes);
4227 ARCSTAT_BUMP(arcstat_evict_not_enough);
4228 }
d164b209 4229
ca0bf58d
PS
4230 break;
4231 }
d164b209 4232 }
34dc7c2f 4233
1c27024e 4234 for (int i = 0; i < num_sublists; i++) {
ca0bf58d
PS
4235 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
4236 multilist_sublist_remove(mls, markers[i]);
4237 multilist_sublist_unlock(mls);
34dc7c2f 4238
ca0bf58d 4239 kmem_cache_free(hdr_full_cache, markers[i]);
34dc7c2f 4240 }
ca0bf58d
PS
4241 kmem_free(markers, sizeof (*markers) * num_sublists);
4242
4243 return (total_evicted);
4244}
4245
4246/*
4247 * Flush all "evictable" data of the given type from the arc state
4248 * specified. This will not evict any "active" buffers (i.e. referenced).
4249 *
d3c2ae1c 4250 * When 'retry' is set to B_FALSE, the function will make a single pass
ca0bf58d
PS
4251 * over the state and evict any buffers that it can. Since it doesn't
4252 * continually retry the eviction, it might end up leaving some buffers
4253 * in the ARC due to lock misses.
4254 *
d3c2ae1c 4255 * When 'retry' is set to B_TRUE, the function will continually retry the
ca0bf58d
PS
4256 * eviction until *all* evictable buffers have been removed from the
4257 * state. As a result, if concurrent insertions into the state are
4258 * allowed (e.g. if the ARC isn't shutting down), this function might
4259 * wind up in an infinite loop, continually trying to evict buffers.
4260 */
4261static uint64_t
4262arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
4263 boolean_t retry)
4264{
4265 uint64_t evicted = 0;
4266
d3c2ae1c 4267 while (refcount_count(&state->arcs_esize[type]) != 0) {
ca0bf58d
PS
4268 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
4269
4270 if (!retry)
4271 break;
4272 }
4273
4274 return (evicted);
34dc7c2f
BB
4275}
4276
ab26409d 4277/*
ef5b2e10
BB
 4278 * Helper function for arc_prune_async(); it is responsible for safely
4279 * handling the execution of a registered arc_prune_func_t.
ab26409d
BB
4280 */
4281static void
f6046738 4282arc_prune_task(void *ptr)
ab26409d 4283{
f6046738
BB
4284 arc_prune_t *ap = (arc_prune_t *)ptr;
4285 arc_prune_func_t *func = ap->p_pfunc;
ab26409d 4286
f6046738
BB
4287 if (func != NULL)
4288 func(ap->p_adjust, ap->p_private);
ab26409d 4289
4442f60d 4290 refcount_remove(&ap->p_refcnt, func);
f6046738 4291}
ab26409d 4292
f6046738
BB
4293/*
4294 * Notify registered consumers they must drop holds on a portion of the ARC
 4295 * buffers they reference. This provides a mechanism to ensure the ARC can
4296 * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
4297 * is analogous to dnlc_reduce_cache() but more generic.
4298 *
ef5b2e10 4299 * This operation is performed asynchronously so it may be safely called
ca67b33a 4300 * in the context of the arc_reclaim_thread(). A reference is taken here
f6046738
BB
4301 * for each registered arc_prune_t and the arc_prune_task() is responsible
4302 * for releasing it once the registered arc_prune_func_t has completed.
4303 */
4304static void
4305arc_prune_async(int64_t adjust)
4306{
4307 arc_prune_t *ap;
ab26409d 4308
f6046738
BB
4309 mutex_enter(&arc_prune_mtx);
4310 for (ap = list_head(&arc_prune_list); ap != NULL;
4311 ap = list_next(&arc_prune_list, ap)) {
ab26409d 4312
f6046738
BB
4313 if (refcount_count(&ap->p_refcnt) >= 2)
4314 continue;
ab26409d 4315
f6046738
BB
4316 refcount_add(&ap->p_refcnt, ap->p_pfunc);
4317 ap->p_adjust = adjust;
b60eac3d 4318 if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
48d3eb40 4319 ap, TQ_SLEEP) == TASKQID_INVALID) {
b60eac3d 4320 refcount_remove(&ap->p_refcnt, ap->p_pfunc);
4321 continue;
4322 }
f6046738 4323 ARCSTAT_BUMP(arcstat_prune);
ab26409d 4324 }
ab26409d
BB
4325 mutex_exit(&arc_prune_mtx);
4326}
4327
ca0bf58d
PS
4328/*
4329 * Evict the specified number of bytes from the state specified,
4330 * restricting eviction to the spa and type given. This function
4331 * prevents us from trying to evict more from a state's list than
4332 * is "evictable", and to skip evicting altogether when passed a
4333 * negative value for "bytes". In contrast, arc_evict_state() will
4334 * evict everything it can, when passed a negative value for "bytes".
4335 */
4336static uint64_t
4337arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
4338 arc_buf_contents_t type)
4339{
4340 int64_t delta;
4341
d3c2ae1c
GW
4342 if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
4343 delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
ca0bf58d
PS
4344 return (arc_evict_state(state, spa, delta, type));
4345 }
4346
4347 return (0);
4348}
4349
4350/*
4351 * The goal of this function is to evict enough meta data buffers from the
4352 * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
4353 * more complicated than it appears because it is common for data buffers
4354 * to have holds on meta data buffers. In addition, dnode meta data buffers
4355 * will be held by the dnodes in the block preventing them from being freed.
4356 * This means we can't simply traverse the ARC and expect to always find
 4357 * enough unheld meta data buffers to release.
4358 *
4359 * Therefore, this function has been updated to make alternating passes
4360 * over the ARC releasing data buffers and then newly unheld meta data
37fb3e43 4361 * buffers. This ensures forward progress is maintained and meta_used
ca0bf58d
PS
4362 * will decrease. Normally this is sufficient, but if required the ARC
4363 * will call the registered prune callbacks causing dentry and inodes to
4364 * be dropped from the VFS cache. This will make dnode meta data buffers
4365 * available for reclaim.
4366 */
4367static uint64_t
37fb3e43 4368arc_adjust_meta_balanced(uint64_t meta_used)
ca0bf58d 4369{
25e2ab16
TC
4370 int64_t delta, prune = 0, adjustmnt;
4371 uint64_t total_evicted = 0;
ca0bf58d 4372 arc_buf_contents_t type = ARC_BUFC_DATA;
ca67b33a 4373 int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
ca0bf58d
PS
4374
4375restart:
4376 /*
 4377 * This slightly differs from the way we evict from the mru in
4378 * arc_adjust because we don't have a "target" value (i.e. no
4379 * "meta" arc_p). As a result, I think we can completely
4380 * cannibalize the metadata in the MRU before we evict the
4381 * metadata from the MFU. I think we probably need to implement a
4382 * "metadata arc_p" value to do this properly.
4383 */
37fb3e43 4384 adjustmnt = meta_used - arc_meta_limit;
ca0bf58d 4385
d3c2ae1c
GW
4386 if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
4387 delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
4388 adjustmnt);
ca0bf58d
PS
4389 total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
4390 adjustmnt -= delta;
4391 }
4392
4393 /*
4394 * We can't afford to recalculate adjustmnt here. If we do,
4395 * new metadata buffers can sneak into the MRU or ANON lists,
 4396 * thus penalizing the MFU metadata. Although the fudge factor is
4397 * small, it has been empirically shown to be significant for
4398 * certain workloads (e.g. creating many empty directories). As
4399 * such, we use the original calculation for adjustmnt, and
4400 * simply decrement the amount of data evicted from the MRU.
4401 */
4402
d3c2ae1c
GW
4403 if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
4404 delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]),
4405 adjustmnt);
ca0bf58d
PS
4406 total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
4407 }
4408
37fb3e43 4409 adjustmnt = meta_used - arc_meta_limit;
ca0bf58d 4410
d3c2ae1c
GW
4411 if (adjustmnt > 0 &&
4412 refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
ca0bf58d 4413 delta = MIN(adjustmnt,
d3c2ae1c 4414 refcount_count(&arc_mru_ghost->arcs_esize[type]));
ca0bf58d
PS
4415 total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
4416 adjustmnt -= delta;
4417 }
4418
d3c2ae1c
GW
4419 if (adjustmnt > 0 &&
4420 refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
ca0bf58d 4421 delta = MIN(adjustmnt,
d3c2ae1c 4422 refcount_count(&arc_mfu_ghost->arcs_esize[type]));
ca0bf58d
PS
4423 total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
4424 }
4425
4426 /*
4427 * If after attempting to make the requested adjustment to the ARC
4428 * the meta limit is still being exceeded then request that the
4429 * higher layers drop some cached objects which have holds on ARC
4430 * meta buffers. Requests to the upper layers will be made with
4431 * increasingly large scan sizes until the ARC is below the limit.
4432 */
37fb3e43 4433 if (meta_used > arc_meta_limit) {
ca0bf58d
PS
4434 if (type == ARC_BUFC_DATA) {
4435 type = ARC_BUFC_METADATA;
4436 } else {
4437 type = ARC_BUFC_DATA;
4438
4439 if (zfs_arc_meta_prune) {
4440 prune += zfs_arc_meta_prune;
f6046738 4441 arc_prune_async(prune);
ca0bf58d
PS
4442 }
4443 }
4444
4445 if (restarts > 0) {
4446 restarts--;
4447 goto restart;
4448 }
4449 }
4450 return (total_evicted);
4451}
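
/*
 * As a rough walk-through with hypothetical numbers: suppose meta_used
 * is 600MB and arc_meta_limit is 512MB, so adjustmnt starts at 88MB.
 * If the MRU holds 40MB of evictable buffers of the current type and
 * the MFU holds 100MB, the first pass evicts 40MB from the MRU, leaves
 * adjustmnt at 48MB, and then evicts 48MB from the MFU.  The ghost
 * lists are trimmed against a freshly computed adjustmnt.  If
 * meta_used is still over the limit, each restart flips the type
 * between ARC_BUFC_DATA and ARC_BUFC_METADATA, and every time the type
 * wraps back to data, prune grows by zfs_arc_meta_prune and
 * arc_prune_async() asks the VFS to drop dentries and inodes so that
 * dnode meta data buffers become evictable.
 */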
4452
f6046738
BB
4453/*
4454 * Evict metadata buffers from the cache, such that arc_meta_used is
4455 * capped by the arc_meta_limit tunable.
4456 */
4457static uint64_t
37fb3e43 4458arc_adjust_meta_only(uint64_t meta_used)
f6046738
BB
4459{
4460 uint64_t total_evicted = 0;
4461 int64_t target;
4462
4463 /*
4464 * If we're over the meta limit, we want to evict enough
4465 * metadata to get back under the meta limit. We don't want to
4466 * evict so much that we drop the MRU below arc_p, though. If
4467 * we're over the meta limit more than we're over arc_p, we
4468 * evict some from the MRU here, and some from the MFU below.
4469 */
37fb3e43 4470 target = MIN((int64_t)(meta_used - arc_meta_limit),
36da08ef
PS
4471 (int64_t)(refcount_count(&arc_anon->arcs_size) +
4472 refcount_count(&arc_mru->arcs_size) - arc_p));
f6046738
BB
4473
4474 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4475
4476 /*
4477 * Similar to the above, we want to evict enough bytes to get us
4478 * below the meta limit, but not so much as to drop us below the
2aa34383 4479 * space allotted to the MFU (which is defined as arc_c - arc_p).
f6046738 4480 */
37fb3e43
PD
4481 target = MIN((int64_t)(meta_used - arc_meta_limit),
4482 (int64_t)(refcount_count(&arc_mfu->arcs_size) -
4483 (arc_c - arc_p)));
f6046738
BB
4484
4485 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4486
4487 return (total_evicted);
4488}
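
/*
 * For example, with hypothetical numbers: if meta_used is 600MB and
 * arc_meta_limit is 512MB we are 88MB over the limit.  With anon + mru
 * at 300MB and arc_p at 250MB, the first target is MIN(88, 50) = 50MB
 * of metadata evicted from the MRU.  With arc_c = 1024MB and an MFU of
 * 800MB, the second target is MIN(88, 800 - (1024 - 250)) = 26MB
 * evicted from the MFU.  Both targets are computed from the same
 * meta_used snapshot passed in by the caller.
 */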
4489
4490static uint64_t
37fb3e43 4491arc_adjust_meta(uint64_t meta_used)
f6046738
BB
4492{
4493 if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
37fb3e43 4494 return (arc_adjust_meta_only(meta_used));
f6046738 4495 else
37fb3e43 4496 return (arc_adjust_meta_balanced(meta_used));
f6046738
BB
4497}
4498
ca0bf58d
PS
4499/*
4500 * Return the type of the oldest buffer in the given arc state
4501 *
4502 * This function will select a random sublist of type ARC_BUFC_DATA and
4503 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
4504 * is compared, and the type which contains the "older" buffer will be
4505 * returned.
4506 */
4507static arc_buf_contents_t
4508arc_adjust_type(arc_state_t *state)
4509{
64fc7762
MA
4510 multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
4511 multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
ca0bf58d
PS
4512 int data_idx = multilist_get_random_index(data_ml);
4513 int meta_idx = multilist_get_random_index(meta_ml);
4514 multilist_sublist_t *data_mls;
4515 multilist_sublist_t *meta_mls;
4516 arc_buf_contents_t type;
4517 arc_buf_hdr_t *data_hdr;
4518 arc_buf_hdr_t *meta_hdr;
4519
4520 /*
4521 * We keep the sublist lock until we're finished, to prevent
4522 * the headers from being destroyed via arc_evict_state().
4523 */
4524 data_mls = multilist_sublist_lock(data_ml, data_idx);
4525 meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
4526
4527 /*
4528 * These two loops are to ensure we skip any markers that
4529 * might be at the tail of the lists due to arc_evict_state().
4530 */
4531
4532 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
4533 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
4534 if (data_hdr->b_spa != 0)
4535 break;
4536 }
4537
4538 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
4539 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
4540 if (meta_hdr->b_spa != 0)
4541 break;
4542 }
4543
4544 if (data_hdr == NULL && meta_hdr == NULL) {
4545 type = ARC_BUFC_DATA;
4546 } else if (data_hdr == NULL) {
4547 ASSERT3P(meta_hdr, !=, NULL);
4548 type = ARC_BUFC_METADATA;
4549 } else if (meta_hdr == NULL) {
4550 ASSERT3P(data_hdr, !=, NULL);
4551 type = ARC_BUFC_DATA;
4552 } else {
4553 ASSERT3P(data_hdr, !=, NULL);
4554 ASSERT3P(meta_hdr, !=, NULL);
4555
4556 /* The headers can't be on the sublist without an L1 header */
4557 ASSERT(HDR_HAS_L1HDR(data_hdr));
4558 ASSERT(HDR_HAS_L1HDR(meta_hdr));
4559
4560 if (data_hdr->b_l1hdr.b_arc_access <
4561 meta_hdr->b_l1hdr.b_arc_access) {
4562 type = ARC_BUFC_DATA;
4563 } else {
4564 type = ARC_BUFC_METADATA;
4565 }
4566 }
4567
4568 multilist_sublist_unlock(meta_mls);
4569 multilist_sublist_unlock(data_mls);
4570
4571 return (type);
4572}
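
/*
 * Note that only one randomly chosen sublist of each type is sampled
 * above, so this is a heuristic rather than a global "oldest buffer"
 * guarantee.  The comparison is on b_arc_access: if, say, the data
 * tail was last touched at lbolt 1000 and the metadata tail at lbolt
 * 1500, the data tail is older and ARC_BUFC_DATA is returned.
 */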
4573
4574/*
4575 * Evict buffers from the cache, such that arc_size is capped by arc_c.
4576 */
4577static uint64_t
4578arc_adjust(void)
4579{
4580 uint64_t total_evicted = 0;
4581 uint64_t bytes;
4582 int64_t target;
37fb3e43
PD
4583 uint64_t asize = aggsum_value(&arc_size);
4584 uint64_t ameta = aggsum_value(&arc_meta_used);
ca0bf58d
PS
4585
4586 /*
4587 * If we're over arc_meta_limit, we want to correct that before
4588 * potentially evicting data buffers below.
4589 */
37fb3e43 4590 total_evicted += arc_adjust_meta(ameta);
ca0bf58d
PS
4591
4592 /*
4593 * Adjust MRU size
4594 *
4595 * If we're over the target cache size, we want to evict enough
4596 * from the list to get back to our target size. We don't want
4597 * to evict too much from the MRU, such that it drops below
4598 * arc_p. So, if we're over our target cache size more than
4599 * the MRU is over arc_p, we'll evict enough to get back to
4600 * arc_p here, and then evict more from the MFU below.
4601 */
37fb3e43 4602 target = MIN((int64_t)(asize - arc_c),
36da08ef 4603 (int64_t)(refcount_count(&arc_anon->arcs_size) +
37fb3e43 4604 refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
ca0bf58d
PS
4605
4606 /*
4607 * If we're below arc_meta_min, always prefer to evict data.
4608 * Otherwise, try to satisfy the requested number of bytes to
4609 * evict from the type which contains older buffers, in an
4610 * effort to keep newer buffers in the cache regardless of their
4611 * type. If we cannot satisfy the number of bytes from this
4612 * type, spill over into the next type.
4613 */
4614 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
37fb3e43 4615 ameta > arc_meta_min) {
ca0bf58d
PS
4616 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4617 total_evicted += bytes;
4618
4619 /*
4620 * If we couldn't evict our target number of bytes from
4621 * metadata, we try to get the rest from data.
4622 */
4623 target -= bytes;
4624
4625 total_evicted +=
4626 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4627 } else {
4628 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4629 total_evicted += bytes;
4630
4631 /*
4632 * If we couldn't evict our target number of bytes from
4633 * data, we try to get the rest from metadata.
4634 */
4635 target -= bytes;
4636
4637 total_evicted +=
4638 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4639 }
4640
0405eeea
RE
4641 /*
4642 * Re-sum ARC stats after the first round of evictions.
4643 */
4644 asize = aggsum_value(&arc_size);
4645 ameta = aggsum_value(&arc_meta_used);
4646
4647
ca0bf58d
PS
4648 /*
4649 * Adjust MFU size
4650 *
4651 * Now that we've tried to evict enough from the MRU to get its
4652 * size back to arc_p, if we're still above the target cache
4653 * size, we evict the rest from the MFU.
4654 */
37fb3e43 4655 target = asize - arc_c;
ca0bf58d 4656
a7b10a93 4657 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
37fb3e43 4658 ameta > arc_meta_min) {
ca0bf58d
PS
4659 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4660 total_evicted += bytes;
4661
4662 /*
4663 * If we couldn't evict our target number of bytes from
4664 * metadata, we try to get the rest from data.
4665 */
4666 target -= bytes;
4667
4668 total_evicted +=
4669 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4670 } else {
4671 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4672 total_evicted += bytes;
4673
4674 /*
4675 * If we couldn't evict our target number of bytes from
4676 * data, we try to get the rest from metadata.
4677 */
4678 target -= bytes;
4679
4680 total_evicted +=
4681 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4682 }
4683
4684 /*
4685 * Adjust ghost lists
4686 *
4687 * In addition to the above, the ARC also defines target values
4688 * for the ghost lists. The sum of the mru list and mru ghost
4689 * list should never exceed the target size of the cache, and
4690 * the sum of the mru list, mfu list, mru ghost list, and mfu
4691 * ghost list should never exceed twice the target size of the
4692 * cache. The following logic enforces these limits on the ghost
4693 * caches, and evicts from them as needed.
4694 */
36da08ef
PS
4695 target = refcount_count(&arc_mru->arcs_size) +
4696 refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
ca0bf58d
PS
4697
4698 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
4699 total_evicted += bytes;
4700
4701 target -= bytes;
4702
4703 total_evicted +=
4704 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
4705
4706 /*
4707 * We assume the sum of the mru list and mfu list is less than
4708 * or equal to arc_c (we enforced this above), which means we
4709 * can use the simpler of the two equations below:
4710 *
4711 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
4712 * mru ghost + mfu ghost <= arc_c
4713 */
36da08ef
PS
4714 target = refcount_count(&arc_mru_ghost->arcs_size) +
4715 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
ca0bf58d
PS
4716
4717 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
4718 total_evicted += bytes;
4719
4720 target -= bytes;
4721
4722 total_evicted +=
4723 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
4724
4725 return (total_evicted);
4726}
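
/*
 * To illustrate the ghost list targets with hypothetical numbers: with
 * arc_c = 1000MB, an MRU of 600MB and an MRU ghost of 500MB, the first
 * target is 600 + 500 - 1000 = 100MB, trimmed from the MRU ghost list
 * (data first, then metadata).  If the MRU ghost list is now 400MB and
 * the MFU ghost list is 700MB, the second target is 400 + 700 - 1000 =
 * 100MB, trimmed from the MFU ghost list.  Together with mru + mfu <=
 * arc_c, enforced earlier in this function, this keeps the total of
 * all four lists within 2 * arc_c.
 */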
4727
ca0bf58d
PS
4728void
4729arc_flush(spa_t *spa, boolean_t retry)
ab26409d 4730{
ca0bf58d 4731 uint64_t guid = 0;
94520ca4 4732
bc888666 4733 /*
d3c2ae1c 4734 * If retry is B_TRUE, a spa must not be specified since we have
ca0bf58d
PS
4735 * no good way to determine if all of a spa's buffers have been
4736 * evicted from an arc state.
bc888666 4737 */
ca0bf58d 4738 ASSERT(!retry || spa == 0);
d164b209 4739
b9541d6b 4740 if (spa != NULL)
3541dc6d 4741 guid = spa_load_guid(spa);
d164b209 4742
ca0bf58d
PS
4743 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
4744 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
4745
4746 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
4747 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
4748
4749 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
4750 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
34dc7c2f 4751
ca0bf58d
PS
4752 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
4753 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
34dc7c2f
BB
4754}
4755
34dc7c2f 4756void
ca67b33a 4757arc_shrink(int64_t to_free)
34dc7c2f 4758{
37fb3e43 4759 uint64_t asize = aggsum_value(&arc_size);
1b8951b3 4760 uint64_t c = arc_c;
34dc7c2f 4761
1b8951b3
TC
4762 if (c > to_free && c - to_free > arc_c_min) {
4763 arc_c = c - to_free;
ca67b33a 4764 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
37fb3e43
PD
4765 if (asize < arc_c)
4766 arc_c = MAX(asize, arc_c_min);
34dc7c2f
BB
4767 if (arc_p > arc_c)
4768 arc_p = (arc_c >> 1);
4769 ASSERT(arc_c >= arc_c_min);
4770 ASSERT((int64_t)arc_p >= 0);
1b8951b3
TC
4771 } else {
4772 arc_c = arc_c_min;
34dc7c2f
BB
4773 }
4774
37fb3e43 4775 if (asize > arc_c)
ca0bf58d 4776 (void) arc_adjust();
34dc7c2f
BB
4777}
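
/*
 * For example, with hypothetical values: given arc_c = 8GB, arc_c_min
 * = 2GB and to_free = 1GB, arc_c drops to 7GB and arc_p is reduced by
 * arc_p >> arc_shrink_shift.  If the current ARC size is below the new
 * arc_c, arc_c is pulled down further to that size (but never below
 * arc_c_min); if the ARC is still larger than arc_c, an immediate
 * arc_adjust() pass evicts down toward the new target.
 */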
4778
9edb3695
BB
4779/*
4780 * Return the maximum amount of memory that we could possibly use. Reduced
4781 * to half of all memory in user space, which is primarily used for testing.
4782 */
4783static uint64_t
4784arc_all_memory(void)
4785{
4786#ifdef _KERNEL
70f02287
BB
4787#ifdef CONFIG_HIGHMEM
4788 return (ptob(totalram_pages - totalhigh_pages));
4789#else
4790 return (ptob(totalram_pages));
4791#endif /* CONFIG_HIGHMEM */
9edb3695
BB
4792#else
4793 return (ptob(physmem) / 2);
70f02287 4794#endif /* _KERNEL */
9edb3695
BB
4795}
4796
70f02287
BB
4797/*
4798 * Return the amount of memory that is considered free. In user space,
4799 * which is primarily used for testing, we pretend that free memory ranges
4800 * from 0-20% of all memory.
4801 */
787acae0
GDN
4802static uint64_t
4803arc_free_memory(void)
4804{
70f02287
BB
4805#ifdef _KERNEL
4806#ifdef CONFIG_HIGHMEM
4807 struct sysinfo si;
4808 si_meminfo(&si);
4809 return (ptob(si.freeram - si.freehigh));
4810#else
70f02287 4811 return (ptob(nr_free_pages() +
e9a77290 4812 nr_inactive_file_pages() +
4813 nr_inactive_anon_pages() +
4814 nr_slab_reclaimable_pages()));
4815
70f02287
BB
4816#endif /* CONFIG_HIGHMEM */
4817#else
4818 return (spa_get_random(arc_all_memory() * 20 / 100));
4819#endif /* _KERNEL */
787acae0 4820}
787acae0 4821
ca67b33a
MA
4822typedef enum free_memory_reason_t {
4823 FMR_UNKNOWN,
4824 FMR_NEEDFREE,
4825 FMR_LOTSFREE,
4826 FMR_SWAPFS_MINFREE,
4827 FMR_PAGES_PP_MAXIMUM,
4828 FMR_HEAP_ARENA,
4829 FMR_ZIO_ARENA,
4830} free_memory_reason_t;
4831
4832int64_t last_free_memory;
4833free_memory_reason_t last_free_reason;
4834
4835#ifdef _KERNEL
ca67b33a
MA
4836/*
4837 * Additional reserve of pages for pp_reserve.
4838 */
4839int64_t arc_pages_pp_reserve = 64;
4840
4841/*
4842 * Additional reserve of pages for swapfs.
4843 */
4844int64_t arc_swapfs_reserve = 64;
ca67b33a
MA
4845#endif /* _KERNEL */
4846
4847/*
4848 * Return the amount of memory that can be consumed before reclaim will be
4849 * needed. Positive if there is sufficient free memory, negative indicates
4850 * the amount of memory that needs to be freed up.
4851 */
4852static int64_t
4853arc_available_memory(void)
4854{
4855 int64_t lowest = INT64_MAX;
4856 free_memory_reason_t r = FMR_UNKNOWN;
ca67b33a 4857#ifdef _KERNEL
ca67b33a 4858 int64_t n;
11f552fa 4859#ifdef __linux__
70f02287
BB
4860#ifdef freemem
4861#undef freemem
4862#endif
11f552fa
BB
4863 pgcnt_t needfree = btop(arc_need_free);
4864 pgcnt_t lotsfree = btop(arc_sys_free);
4865 pgcnt_t desfree = 0;
70f02287 4866 pgcnt_t freemem = btop(arc_free_memory());
9edb3695
BB
4867#endif
4868
ca67b33a
MA
4869 if (needfree > 0) {
4870 n = PAGESIZE * (-needfree);
4871 if (n < lowest) {
4872 lowest = n;
4873 r = FMR_NEEDFREE;
4874 }
4875 }
4876
4877 /*
4878 * check that we're out of range of the pageout scanner. It starts to
4879 * schedule paging if freemem is less than lotsfree and needfree.
4880 * lotsfree is the high-water mark for pageout, and needfree is the
4881 * number of needed free pages. We add extra pages here to make sure
4882 * the scanner doesn't start up while we're freeing memory.
4883 */
70f02287 4884 n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
ca67b33a
MA
4885 if (n < lowest) {
4886 lowest = n;
4887 r = FMR_LOTSFREE;
4888 }
4889
11f552fa 4890#ifndef __linux__
ca67b33a
MA
4891 /*
4892 * check to make sure that swapfs has enough space so that anon
4893 * reservations can still succeed. anon_resvmem() checks that the
4894 * availrmem is greater than swapfs_minfree, and the number of reserved
4895 * swap pages. We also add a bit of extra here just to prevent
4896 * circumstances from getting really dire.
4897 */
4898 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
4899 desfree - arc_swapfs_reserve);
4900 if (n < lowest) {
4901 lowest = n;
4902 r = FMR_SWAPFS_MINFREE;
4903 }
4904
ca67b33a
MA
4905 /*
4906 * Check that we have enough availrmem that memory locking (e.g., via
4907 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
4908 * stores the number of pages that cannot be locked; when availrmem
4909 * drops below pages_pp_maximum, page locking mechanisms such as
4910 * page_pp_lock() will fail.)
4911 */
4912 n = PAGESIZE * (availrmem - pages_pp_maximum -
4913 arc_pages_pp_reserve);
4914 if (n < lowest) {
4915 lowest = n;
4916 r = FMR_PAGES_PP_MAXIMUM;
4917 }
11f552fa 4918#endif
ca67b33a 4919
70f02287 4920#if defined(_ILP32)
ca67b33a 4921 /*
70f02287 4922 * If we're on a 32-bit platform, it's possible that we'll exhaust the
ca67b33a
MA
4923 * kernel heap space before we ever run out of available physical
4924 * memory. Most checks of the size of the heap_area compare against
4925 * tune.t_minarmem, which is the minimum available real memory that we
4926 * can have in the system. However, this is generally fixed at 25 pages
4927 * which is so low that it's useless. In this comparison, we seek to
4928 * calculate the total heap-size, and reclaim if more than 3/4ths of the
4929 * heap is allocated. (Or, in the calculation, if less than 1/4th is
4930 * free)
4931 */
4932 n = vmem_size(heap_arena, VMEM_FREE) -
4933 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
4934 if (n < lowest) {
4935 lowest = n;
4936 r = FMR_HEAP_ARENA;
4937 }
4938#endif
4939
4940 /*
4941 * If zio data pages are being allocated out of a separate heap segment,
4942 * then enforce that the size of available vmem for this arena remains
d3c2ae1c 4943 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
ca67b33a 4944 *
d3c2ae1c
GW
4945 * Note that reducing the arc_zio_arena_free_shift keeps more virtual
4946 * memory (in the zio_arena) free, which can avoid memory
4947 * fragmentation issues.
ca67b33a
MA
4948 */
4949 if (zio_arena != NULL) {
9edb3695
BB
4950 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
4951 (vmem_size(zio_arena, VMEM_ALLOC) >>
4952 arc_zio_arena_free_shift);
ca67b33a
MA
4953 if (n < lowest) {
4954 lowest = n;
4955 r = FMR_ZIO_ARENA;
4956 }
4957 }
11f552fa 4958#else /* _KERNEL */
ca67b33a
MA
4959 /* Every 100 calls, free a small amount */
4960 if (spa_get_random(100) == 0)
4961 lowest = -1024;
11f552fa 4962#endif /* _KERNEL */
ca67b33a
MA
4963
4964 last_free_memory = lowest;
4965 last_free_reason = r;
4966
4967 return (lowest);
4968}
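
/*
 * As a worked example of the zio_arena check, using hypothetical
 * sizes: with arc_zio_arena_free_shift = 2, 10GB allocated from the
 * arena and only 2GB free, n = 2GB - (10GB >> 2) = -0.5GB, so lowest
 * goes negative, FMR_ZIO_ARENA is recorded as the reason, and the
 * caller is asked to free roughly 0.5GB.  The same pattern applies to
 * every check above: the smallest value wins and its reason is saved
 * in last_free_reason.
 */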
4969
4970/*
4971 * Determine if the system is under memory pressure and is asking
d3c2ae1c 4972 * to reclaim memory. A return value of B_TRUE indicates that the system
ca67b33a
MA
4973 * is under memory pressure and that the arc should adjust accordingly.
4974 */
4975static boolean_t
4976arc_reclaim_needed(void)
4977{
4978 return (arc_available_memory() < 0);
4979}
4980
34dc7c2f 4981static void
ca67b33a 4982arc_kmem_reap_now(void)
34dc7c2f
BB
4983{
4984 size_t i;
4985 kmem_cache_t *prev_cache = NULL;
4986 kmem_cache_t *prev_data_cache = NULL;
4987 extern kmem_cache_t *zio_buf_cache[];
4988 extern kmem_cache_t *zio_data_buf_cache[];
669dedb3 4989 extern kmem_cache_t *range_seg_cache;
34dc7c2f 4990
70f02287 4991#ifdef _KERNEL
37fb3e43
PD
4992 if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
4993 zfs_arc_meta_prune) {
f6046738
BB
4994 /*
4995 * We are exceeding our meta-data cache limit.
4996 * Prune some entries to release holds on meta-data.
4997 */
ef5b2e10 4998 arc_prune_async(zfs_arc_meta_prune);
f6046738 4999 }
70f02287
BB
5000#if defined(_ILP32)
5001 /*
5002 * Reclaim unused memory from all kmem caches.
5003 */
5004 kmem_reap();
5005#endif
5006#endif
f6046738 5007
34dc7c2f 5008 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
70f02287 5009#if defined(_ILP32)
d0c614ec 5010 /* reach upper limit of cache size on 32-bit */
5011 if (zio_buf_cache[i] == NULL)
5012 break;
5013#endif
34dc7c2f
BB
5014 if (zio_buf_cache[i] != prev_cache) {
5015 prev_cache = zio_buf_cache[i];
5016 kmem_cache_reap_now(zio_buf_cache[i]);
5017 }
5018 if (zio_data_buf_cache[i] != prev_data_cache) {
5019 prev_data_cache = zio_data_buf_cache[i];
5020 kmem_cache_reap_now(zio_data_buf_cache[i]);
5021 }
5022 }
ca0bf58d 5023 kmem_cache_reap_now(buf_cache);
b9541d6b
CW
5024 kmem_cache_reap_now(hdr_full_cache);
5025 kmem_cache_reap_now(hdr_l2only_cache);
669dedb3 5026 kmem_cache_reap_now(range_seg_cache);
ca67b33a
MA
5027
5028 if (zio_arena != NULL) {
5029 /*
5030 * Ask the vmem arena to reclaim unused memory from its
5031 * quantum caches.
5032 */
5033 vmem_qcache_reap(zio_arena);
5034 }
34dc7c2f
BB
5035}
5036
302f753f 5037/*
a6255b7f 5038 * Threads can block in arc_get_data_impl() waiting for this thread to evict
ca0bf58d 5039 * enough data and signal them to proceed. When this happens, the threads in
a6255b7f 5040 * arc_get_data_impl() are sleeping while holding the hash lock for their
ca0bf58d
PS
5041 * particular arc header. Thus, we must be careful to never sleep on a
5042 * hash lock in this thread. This is to prevent the following deadlock:
5043 *
a6255b7f 5044 * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
ca0bf58d
PS
5045 * waiting for the reclaim thread to signal it.
5046 *
5047 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
5048 * fails, and goes to sleep forever.
5049 *
5050 * This possible deadlock is avoided by always acquiring a hash lock
5051 * using mutex_tryenter() from arc_reclaim_thread().
302f753f 5052 */
867959b5 5053/* ARGSUSED */
34dc7c2f 5054static void
c25b8f99 5055arc_reclaim_thread(void *unused)
34dc7c2f 5056{
ca67b33a 5057 fstrans_cookie_t cookie = spl_fstrans_mark();
ae6d0c60 5058 hrtime_t growtime = 0;
34dc7c2f
BB
5059 callb_cpr_t cpr;
5060
ca0bf58d 5061 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
34dc7c2f 5062
ca0bf58d 5063 mutex_enter(&arc_reclaim_lock);
ca67b33a 5064 while (!arc_reclaim_thread_exit) {
ca67b33a 5065 uint64_t evicted = 0;
30fffb90 5066 uint64_t need_free = arc_need_free;
ca67b33a 5067 arc_tuning_update();
34dc7c2f 5068
d3c2ae1c
GW
5069 /*
5070 * This is necessary in order for the mdb ::arc dcmd to
5071 * show up-to-date information. Since the ::arc command
5072 * does not call the kstat's update function, without
5073 * this call, the command may show stale stats for the
5074 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
5075 * with this change, the data might be up to 1 second
5076 * out of date; but that should suffice. The arc_state_t
5077 * structures can be queried directly if more accurate
5078 * information is needed.
5079 */
5080#ifndef __linux__
5081 if (arc_ksp != NULL)
5082 arc_ksp->ks_update(arc_ksp, KSTAT_READ);
5083#endif
ca67b33a 5084 mutex_exit(&arc_reclaim_lock);
34dc7c2f 5085
0a252dae
GM
5086 /*
5087 * We call arc_adjust() before (possibly) calling
5088 * arc_kmem_reap_now(), so that we can wake up
5089 * arc_get_data_buf() sooner.
5090 */
5091 evicted = arc_adjust();
5092
5093 int64_t free_memory = arc_available_memory();
ca67b33a 5094 if (free_memory < 0) {
34dc7c2f 5095
ca67b33a 5096 arc_no_grow = B_TRUE;
b128c09f 5097 arc_warm = B_TRUE;
34dc7c2f 5098
ca67b33a
MA
5099 /*
5100 * Wait at least zfs_grow_retry (default 5) seconds
5101 * before considering growing.
5102 */
ae6d0c60 5103 growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
6a8f9b6b 5104
ca67b33a 5105 arc_kmem_reap_now();
34dc7c2f 5106
ca67b33a
MA
5107 /*
5108 * If we are still low on memory, shrink the ARC
5109 * so that we have arc_shrink_min free space.
5110 */
5111 free_memory = arc_available_memory();
34dc7c2f 5112
1c27024e
DB
5113 int64_t to_free =
5114 (arc_c >> arc_shrink_shift) - free_memory;
ca67b33a
MA
5115 if (to_free > 0) {
5116#ifdef _KERNEL
30fffb90 5117 to_free = MAX(to_free, need_free);
ca67b33a
MA
5118#endif
5119 arc_shrink(to_free);
5120 }
5121 } else if (free_memory < arc_c >> arc_no_grow_shift) {
5122 arc_no_grow = B_TRUE;
ae6d0c60 5123 } else if (gethrtime() >= growtime) {
ca67b33a
MA
5124 arc_no_grow = B_FALSE;
5125 }
bce45ec9 5126
ca67b33a 5127 mutex_enter(&arc_reclaim_lock);
bce45ec9 5128
ca67b33a
MA
5129 /*
5130 * If evicted is zero, we couldn't evict anything via
5131 * arc_adjust(). This could be due to hash lock
5132 * collisions, but more likely due to the majority of
5133 * arc buffers being unevictable. Therefore, even if
5134 * arc_size is above arc_c, another pass is unlikely to
5135 * be helpful and could potentially cause us to enter an
5136 * infinite loop.
5137 */
37fb3e43 5138 if (aggsum_compare(&arc_size, arc_c) <= 0 || evicted == 0) {
ca67b33a
MA
5139 /*
5140 * We're either no longer overflowing, or we
5141 * can't evict anything more, so we should wake
30fffb90
DB
5142 * up any threads before we go to sleep and remove
5143 * the bytes we were working on from arc_need_free
5144 * since nothing more will be done here.
ca67b33a
MA
5145 */
5146 cv_broadcast(&arc_reclaim_waiters_cv);
30fffb90 5147 ARCSTAT_INCR(arcstat_need_free, -need_free);
bce45ec9 5148
ca67b33a
MA
5149 /*
5150 * Block until signaled, or after one second (we
5151 * might need to perform arc_kmem_reap_now()
5152 * even if we aren't being signalled)
5153 */
5154 CALLB_CPR_SAFE_BEGIN(&cpr);
a9bb2b68 5155 (void) cv_timedwait_sig_hires(&arc_reclaim_thread_cv,
ae6d0c60 5156 &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
ca67b33a
MA
5157 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
5158 }
ca0bf58d 5159 }
bce45ec9 5160
d3c2ae1c 5161 arc_reclaim_thread_exit = B_FALSE;
ca0bf58d
PS
5162 cv_broadcast(&arc_reclaim_thread_cv);
5163 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
5164 spl_fstrans_unmark(cookie);
5165 thread_exit();
5166}
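
/*
 * A condensed sketch of the loop above (kstat refresh and tunable
 * updates omitted):
 *
 *	while (!arc_reclaim_thread_exit) {
 *		evicted = arc_adjust();
 *		if (arc_available_memory() < 0) {
 *			arc_no_grow = B_TRUE;
 *			arc_kmem_reap_now();
 *			arc_shrink(deficit);
 *		} else if (free memory is merely low) {
 *			arc_no_grow = B_TRUE;
 *		} else if (arc_grow_retry seconds have passed) {
 *			arc_no_grow = B_FALSE;
 *		}
 *		if (no longer overflowing or evicted == 0)
 *			wake the waiters and sleep for up to a second;
 *	}
 */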
5167
7cb67b45
BB
5168#ifdef _KERNEL
5169/*
302f753f
BB
5170 * Determine the amount of memory eligible for eviction contained in the
5171 * ARC. All clean data reported by the ghost lists can always be safely
5172 * evicted. Due to arc_c_min, the same does not hold for all clean data
5173 * contained by the regular mru and mfu lists.
5174 *
5175 * In the case of the regular mru and mfu lists, we need to report as
5176 * much clean data as possible, such that evicting that same reported
5177 * data will not bring arc_size below arc_c_min. Thus, in certain
5178 * circumstances, the total amount of clean data in the mru and mfu
5179 * lists might not actually be evictable.
5180 *
5181 * The following two distinct cases are accounted for:
5182 *
5183 * 1. The sum of the amount of dirty data contained by both the mru and
5184 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
5185 * is greater than or equal to arc_c_min.
5186 * (i.e. amount of dirty data >= arc_c_min)
5187 *
5188 * This is the easy case; all clean data contained by the mru and mfu
5189 * lists is evictable. Evicting all clean data can only drop arc_size
5190 * to the amount of dirty data, which is greater than arc_c_min.
5191 *
5192 * 2. The sum of the amount of dirty data contained by both the mru and
5193 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
5194 * is less than arc_c_min.
5195 * (i.e. arc_c_min > amount of dirty data)
5196 *
5197 * 2.1. arc_size is greater than or equal to arc_c_min.
5198 * (i.e. arc_size >= arc_c_min > amount of dirty data)
5199 *
5200 * In this case, not all clean data from the regular mru and mfu
5201 * lists is actually evictable; we must leave enough clean data
5202 * to keep arc_size above arc_c_min. Thus, the maximum amount of
5203 * evictable data from the two lists combined, is exactly the
5204 * difference between arc_size and arc_c_min.
5205 *
5206 * 2.2. arc_size is less than arc_c_min
5207 * (i.e. arc_c_min > arc_size > amount of dirty data)
5208 *
5209 * In this case, none of the data contained in the mru and mfu
5210 * lists is evictable, even if it's clean. Since arc_size is
5211 * already below arc_c_min, evicting any more would only
5212 * increase this negative difference.
7cb67b45 5213 */
302f753f 5214static uint64_t
4ea3f864
GM
5215arc_evictable_memory(void)
5216{
37fb3e43 5217 int64_t asize = aggsum_value(&arc_size);
302f753f 5218 uint64_t arc_clean =
d3c2ae1c
GW
5219 refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
5220 refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
5221 refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
5222 refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
37fb3e43 5223 uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);
302f753f 5224
03b60eee
DB
5225 /*
5226 * Scale reported evictable memory in proportion to page cache, cap
5227 * at specified min/max.
5228 */
e9a77290 5229 uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent;
03b60eee
DB
5230 min = MAX(arc_c_min, MIN(arc_c_max, min));
5231
5232 if (arc_dirty >= min)
9b50146d 5233 return (arc_clean);
302f753f 5234
37fb3e43 5235 return (MAX((int64_t)asize - (int64_t)min, 0));
302f753f
BB
5236}
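
/*
 * As a numeric illustration of case 2.1 above, with hypothetical
 * values: if asize is 4GB, arc_clean is 3.5GB and the page-cache
 * scaled floor "min" works out to 1GB, then arc_dirty = 0.5GB is below
 * min, so we report asize - min = 3GB as evictable rather than the
 * full 3.5GB of clean data, leaving at least 1GB resident.  Only when
 * the dirty portion alone already covers the floor do we report all of
 * the clean data.
 */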
5237
ed6e9cc2
TC
5238/*
5239 * If sc->nr_to_scan is zero, the caller is requesting a query of the
5240 * number of objects which can potentially be freed. If it is nonzero,
5241 * the request is to free that many objects.
5242 *
5243 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
5244 * in struct shrinker and also require the shrinker to return the number
5245 * of objects freed.
5246 *
5247 * Older kernels require the shrinker to return the number of freeable
5248 * objects following the freeing of nr_to_free.
5249 */
5250static spl_shrinker_t
7e7baeca 5251__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
7cb67b45 5252{
ed6e9cc2 5253 int64_t pages;
7cb67b45 5254
302f753f
BB
5255 /* The arc is considered warm once reclaim has occurred */
5256 if (unlikely(arc_warm == B_FALSE))
5257 arc_warm = B_TRUE;
7cb67b45 5258
302f753f 5259 /* Return the potential number of reclaimable pages */
ed6e9cc2 5260 pages = btop((int64_t)arc_evictable_memory());
302f753f
BB
5261 if (sc->nr_to_scan == 0)
5262 return (pages);
3fd70ee6
BB
5263
5264 /* Not allowed to perform filesystem reclaim */
7e7baeca 5265 if (!(sc->gfp_mask & __GFP_FS))
ed6e9cc2 5266 return (SHRINK_STOP);
3fd70ee6 5267
7cb67b45 5268 /* Reclaim in progress */
b855550c
DB
5269 if (mutex_tryenter(&arc_reclaim_lock) == 0) {
5270 ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan));
2e91c2fb 5271 return (0);
b855550c 5272 }
7cb67b45 5273
ca0bf58d
PS
5274 mutex_exit(&arc_reclaim_lock);
5275
302f753f
BB
5276 /*
5277 * Evict the requested number of pages by shrinking arc_c by the
44813aef 5278 * requested amount.
302f753f
BB
5279 */
5280 if (pages > 0) {
ca67b33a 5281 arc_shrink(ptob(sc->nr_to_scan));
44813aef
DB
5282 if (current_is_kswapd())
5283 arc_kmem_reap_now();
ed6e9cc2 5284#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
4149bf49
DB
5285 pages = MAX((int64_t)pages -
5286 (int64_t)btop(arc_evictable_memory()), 0);
ed6e9cc2 5287#else
1e3cb67b 5288 pages = btop(arc_evictable_memory());
ed6e9cc2 5289#endif
1a31dcf5
DB
5290 /*
5291 * We've shrunk what we can, wake up threads.
5292 */
5293 cv_broadcast(&arc_reclaim_waiters_cv);
44813aef 5294 } else
ed6e9cc2 5295 pages = SHRINK_STOP;
302f753f
BB
5296
5297 /*
5298 * When direct reclaim is observed it usually indicates a rapid
5299 * increase in memory pressure. This occurs because the kswapd
5300 * threads were unable to asynchronously keep enough free memory
5301 * available. In this case set arc_no_grow to briefly pause arc
5302 * growth to avoid compounding the memory pressure.
5303 */
7cb67b45 5304 if (current_is_kswapd()) {
302f753f 5305 ARCSTAT_BUMP(arcstat_memory_indirect_count);
7cb67b45 5306 } else {
302f753f 5307 arc_no_grow = B_TRUE;
44813aef 5308 arc_kmem_reap_now();
302f753f 5309 ARCSTAT_BUMP(arcstat_memory_direct_count);
7cb67b45
BB
5310 }
5311
1e3cb67b 5312 return (pages);
7cb67b45 5313}
7e7baeca 5314SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
7cb67b45
BB
5315
5316SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
5317#endif /* _KERNEL */
5318
34dc7c2f
BB
5319/*
5320 * Adapt arc info given the number of bytes we are trying to add and
4e33ba4c 5321 * the state that we are coming from. This function is only called
34dc7c2f
BB
5322 * when we are adding new content to the cache.
5323 */
5324static void
5325arc_adapt(int bytes, arc_state_t *state)
5326{
5327 int mult;
728d6ae9 5328 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
36da08ef
PS
5329 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
5330 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
34dc7c2f
BB
5331
5332 if (state == arc_l2c_only)
5333 return;
5334
5335 ASSERT(bytes > 0);
5336 /*
5337 * Adapt the target size of the MRU list:
5338 * - if we just hit in the MRU ghost list, then increase
5339 * the target size of the MRU list.
5340 * - if we just hit in the MFU ghost list, then increase
5341 * the target size of the MFU list by decreasing the
5342 * target size of the MRU list.
5343 */
5344 if (state == arc_mru_ghost) {
36da08ef 5345 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
62422785
PS
5346 if (!zfs_arc_p_dampener_disable)
5347 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
34dc7c2f 5348
728d6ae9 5349 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
34dc7c2f 5350 } else if (state == arc_mfu_ghost) {
d164b209
BB
5351 uint64_t delta;
5352
36da08ef 5353 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
62422785
PS
5354 if (!zfs_arc_p_dampener_disable)
5355 mult = MIN(mult, 10);
34dc7c2f 5356
d164b209 5357 delta = MIN(bytes * mult, arc_p);
728d6ae9 5358 arc_p = MAX(arc_p_min, arc_p - delta);
34dc7c2f
BB
5359 }
5360 ASSERT((int64_t)arc_p >= 0);
5361
ca67b33a
MA
5362 if (arc_reclaim_needed()) {
5363 cv_signal(&arc_reclaim_thread_cv);
5364 return;
5365 }
5366
34dc7c2f
BB
5367 if (arc_no_grow)
5368 return;
5369
5370 if (arc_c >= arc_c_max)
5371 return;
5372
5373 /*
5374 * If we're within (2 * maxblocksize) bytes of the target
5375 * cache size, increment the target cache size
5376 */
935434ef 5377 ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
37fb3e43
PD
5378 if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >=
5379 0) {
34dc7c2f
BB
5380 atomic_add_64(&arc_c, (int64_t)bytes);
5381 if (arc_c > arc_c_max)
5382 arc_c = arc_c_max;
5383 else if (state == arc_anon)
5384 atomic_add_64(&arc_p, (int64_t)bytes);
5385 if (arc_p > arc_c)
5386 arc_p = arc_c;
5387 }
5388 ASSERT((int64_t)arc_p >= 0);
5389}
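
/*
 * To illustrate the adaptation with hypothetical numbers: a hit in the
 * MRU ghost list with mrug_size = 100MB and mfug_size = 400MB yields
 * mult = 4 (capped at 10 unless zfs_arc_p_dampener_disable is set), so
 * a 16K block grows arc_p by 64K, up to arc_c - arc_p_min.  A hit in
 * the MFU ghost list shrinks arc_p in the same fashion, down to
 * arc_p_min.  The list whose ghost entries keep being re-referenced is
 * the one whose target share of the cache grows.
 */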
5390
5391/*
ca0bf58d
PS
5392 * Check if arc_size has grown past our upper threshold, determined by
5393 * zfs_arc_overflow_shift.
34dc7c2f 5394 */
ca0bf58d
PS
5395static boolean_t
5396arc_is_overflowing(void)
34dc7c2f 5397{
ca0bf58d
PS
5398 /* Always allow at least one block of overflow */
5399 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
5400 arc_c >> zfs_arc_overflow_shift);
34dc7c2f 5401
37fb3e43
PD
5402 /*
5403 * We just compare the lower bound here for performance reasons. Our
5404 * primary goals are to make sure that the arc never grows without
5405 * bound, and that it can reach its maximum size. This check
5406 * accomplishes both goals. The maximum amount we could run over by is
5407 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
5408 * in the ARC. In practice, that's in the tens of MB, which is low
5409 * enough to be safe.
5410 */
5411 return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
34dc7c2f
BB
5412}
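
/*
 * For example, with arc_c = 8GB and zfs_arc_overflow_shift = 8
 * (hypothetical values), the allowed overflow is MAX(SPA_MAXBLOCKSIZE,
 * 32MB) = 32MB, so allocating threads in arc_get_data_impl() start
 * blocking on the reclaim thread once the lower bound of arc_size
 * reaches arc_c + 32MB.
 */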
5413
a6255b7f
DQ
5414static abd_t *
5415arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5416{
5417 arc_buf_contents_t type = arc_buf_type(hdr);
5418
5419 arc_get_data_impl(hdr, size, tag);
5420 if (type == ARC_BUFC_METADATA) {
5421 return (abd_alloc(size, B_TRUE));
5422 } else {
5423 ASSERT(type == ARC_BUFC_DATA);
5424 return (abd_alloc(size, B_FALSE));
5425 }
5426}
5427
5428static void *
5429arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5430{
5431 arc_buf_contents_t type = arc_buf_type(hdr);
5432
5433 arc_get_data_impl(hdr, size, tag);
5434 if (type == ARC_BUFC_METADATA) {
5435 return (zio_buf_alloc(size));
5436 } else {
5437 ASSERT(type == ARC_BUFC_DATA);
5438 return (zio_data_buf_alloc(size));
5439 }
5440}
5441
34dc7c2f 5442/*
d3c2ae1c
GW
5443 * Allocate a block and return it to the caller. If we are hitting the
5444 * hard limit for the cache size, we must sleep, waiting for the eviction
5445 * thread to catch up. If we're past the target size but below the hard
5446 * limit, we'll only signal the reclaim thread and continue on.
34dc7c2f 5447 */
a6255b7f
DQ
5448static void
5449arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
34dc7c2f 5450{
a6255b7f
DQ
5451 arc_state_t *state = hdr->b_l1hdr.b_state;
5452 arc_buf_contents_t type = arc_buf_type(hdr);
34dc7c2f
BB
5453
5454 arc_adapt(size, state);
5455
5456 /*
ca0bf58d
PS
5457 * If arc_size is currently overflowing, and has grown past our
5458 * upper limit, we must be adding data faster than the evict
5459 * thread can evict. Thus, to ensure we don't compound the
5460 * problem by adding more data and forcing arc_size to grow even
5461 * further past its target size, we halt and wait for the
5462 * eviction thread to catch up.
5463 *
5464 * It's also possible that the reclaim thread is unable to evict
5465 * enough buffers to get arc_size below the overflow limit (e.g.
5466 * due to buffers being un-evictable, or hash lock collisions).
5467 * In this case, we want to proceed regardless if we're
5468 * overflowing; thus we don't use a while loop here.
34dc7c2f 5469 */
ca0bf58d
PS
5470 if (arc_is_overflowing()) {
5471 mutex_enter(&arc_reclaim_lock);
5472
5473 /*
5474 * Now that we've acquired the lock, we may no longer be
5475 * over the overflow limit; let's check.
5476 *
5477 * We're ignoring the case of spurious wake ups. If that
5478 * were to happen, it'd let this thread consume an ARC
5479 * buffer before it should have (i.e. before we're under
5480 * the overflow limit and were signalled by the reclaim
5481 * thread). As long as that is a rare occurrence, it
5482 * shouldn't cause any harm.
5483 */
5484 if (arc_is_overflowing()) {
5485 cv_signal(&arc_reclaim_thread_cv);
5486 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
34dc7c2f 5487 }
34dc7c2f 5488
ca0bf58d 5489 mutex_exit(&arc_reclaim_lock);
34dc7c2f 5490 }
ab26409d 5491
d3c2ae1c 5492 VERIFY3U(hdr->b_type, ==, type);
da8ccd0e 5493 if (type == ARC_BUFC_METADATA) {
ca0bf58d
PS
5494 arc_space_consume(size, ARC_SPACE_META);
5495 } else {
ca0bf58d 5496 arc_space_consume(size, ARC_SPACE_DATA);
da8ccd0e
PS
5497 }
5498
34dc7c2f
BB
5499 /*
5500 * Update the state size. Note that ghost states have a
5501 * "ghost size" and so don't need to be updated.
5502 */
d3c2ae1c 5503 if (!GHOST_STATE(state)) {
34dc7c2f 5504
d3c2ae1c 5505 (void) refcount_add_many(&state->arcs_size, size, tag);
ca0bf58d
PS
5506
5507 /*
5508 * If this is reached via arc_read, the link is
5509 * protected by the hash lock. If reached via
5510 * arc_buf_alloc, the header should not be accessed by
5511 * any other thread. And, if reached via arc_read_done,
5512 * the hash lock will protect it if it's found in the
5513 * hash table; otherwise no other thread should be
5514 * trying to [add|remove]_reference it.
5515 */
5516 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
b9541d6b 5517 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
d3c2ae1c
GW
5518 (void) refcount_add_many(&state->arcs_esize[type],
5519 size, tag);
34dc7c2f 5520 }
d3c2ae1c 5521
34dc7c2f
BB
5522 /*
5523 * If we are growing the cache, and we are adding anonymous
5524 * data, and we have outgrown arc_p, update arc_p
5525 */
37fb3e43
PD
5526 if (aggsum_compare(&arc_size, arc_c) < 0 &&
5527 hdr->b_l1hdr.b_state == arc_anon &&
36da08ef
PS
5528 (refcount_count(&arc_anon->arcs_size) +
5529 refcount_count(&arc_mru->arcs_size) > arc_p))
34dc7c2f
BB
5530 arc_p = MIN(arc_c, arc_p + size);
5531 }
a6255b7f
DQ
5532}
5533
5534static void
5535arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
5536{
5537 arc_free_data_impl(hdr, size, tag);
5538 abd_free(abd);
5539}
5540
5541static void
5542arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
5543{
5544 arc_buf_contents_t type = arc_buf_type(hdr);
5545
5546 arc_free_data_impl(hdr, size, tag);
5547 if (type == ARC_BUFC_METADATA) {
5548 zio_buf_free(buf, size);
5549 } else {
5550 ASSERT(type == ARC_BUFC_DATA);
5551 zio_data_buf_free(buf, size);
5552 }
d3c2ae1c
GW
5553}
5554
5555/*
5556 * Free the arc data buffer.
5557 */
5558static void
a6255b7f 5559arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
d3c2ae1c
GW
5560{
5561 arc_state_t *state = hdr->b_l1hdr.b_state;
5562 arc_buf_contents_t type = arc_buf_type(hdr);
5563
5564 /* protected by hash lock, if in the hash table */
5565 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5566 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5567 ASSERT(state != arc_anon && state != arc_l2c_only);
5568
5569 (void) refcount_remove_many(&state->arcs_esize[type],
5570 size, tag);
5571 }
5572 (void) refcount_remove_many(&state->arcs_size, size, tag);
5573
5574 VERIFY3U(hdr->b_type, ==, type);
5575 if (type == ARC_BUFC_METADATA) {
d3c2ae1c
GW
5576 arc_space_return(size, ARC_SPACE_META);
5577 } else {
5578 ASSERT(type == ARC_BUFC_DATA);
d3c2ae1c
GW
5579 arc_space_return(size, ARC_SPACE_DATA);
5580 }
34dc7c2f
BB
5581}
5582
5583/*
5584 * This routine is called whenever a buffer is accessed.
5585 * NOTE: the hash lock is dropped in this function.
5586 */
5587static void
2a432414 5588arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
34dc7c2f 5589{
428870ff
BB
5590 clock_t now;
5591
34dc7c2f 5592 ASSERT(MUTEX_HELD(hash_lock));
b9541d6b 5593 ASSERT(HDR_HAS_L1HDR(hdr));
34dc7c2f 5594
b9541d6b 5595 if (hdr->b_l1hdr.b_state == arc_anon) {
34dc7c2f
BB
5596 /*
5597 * This buffer is not in the cache, and does not
5598 * appear in our "ghost" list. Add the new buffer
5599 * to the MRU state.
5600 */
5601
b9541d6b
CW
5602 ASSERT0(hdr->b_l1hdr.b_arc_access);
5603 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
5604 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5605 arc_change_state(arc_mru, hdr, hash_lock);
34dc7c2f 5606
b9541d6b 5607 } else if (hdr->b_l1hdr.b_state == arc_mru) {
428870ff
BB
5608 now = ddi_get_lbolt();
5609
34dc7c2f
BB
5610 /*
5611 * If this buffer is here because of a prefetch, then either:
5612 * - clear the flag if this is a "referencing" read
5613 * (any subsequent access will bump this into the MFU state).
5614 * or
5615 * - move the buffer to the head of the list if this is
5616 * another prefetch (to make it less likely to be evicted).
5617 */
d4a72f23 5618 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
b9541d6b 5619 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
ca0bf58d
PS
5620 /* link protected by hash lock */
5621 ASSERT(multilist_link_active(
b9541d6b 5622 &hdr->b_l1hdr.b_arc_node));
34dc7c2f 5623 } else {
d4a72f23
TC
5624 arc_hdr_clear_flags(hdr,
5625 ARC_FLAG_PREFETCH |
5626 ARC_FLAG_PRESCIENT_PREFETCH);
b9541d6b 5627 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
34dc7c2f
BB
5628 ARCSTAT_BUMP(arcstat_mru_hits);
5629 }
b9541d6b 5630 hdr->b_l1hdr.b_arc_access = now;
34dc7c2f
BB
5631 return;
5632 }
5633
5634 /*
5635 * This buffer has been "accessed" only once so far,
5636 * but it is still in the cache. Move it to the MFU
5637 * state.
5638 */
b9541d6b
CW
5639 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
5640 ARC_MINTIME)) {
34dc7c2f
BB
5641 /*
5642 * More than 125ms have passed since we
5643 * instantiated this buffer. Move it to the
5644 * most frequently used state.
5645 */
b9541d6b 5646 hdr->b_l1hdr.b_arc_access = now;
2a432414
GW
5647 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5648 arc_change_state(arc_mfu, hdr, hash_lock);
34dc7c2f 5649 }
b9541d6b 5650 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
34dc7c2f 5651 ARCSTAT_BUMP(arcstat_mru_hits);
b9541d6b 5652 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
34dc7c2f
BB
5653 arc_state_t *new_state;
5654 /*
5655 * This buffer has been "accessed" recently, but
5656 * was evicted from the cache. Move it to the
5657 * MFU state.
5658 */
5659
d4a72f23 5660 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
34dc7c2f 5661 new_state = arc_mru;
d4a72f23
TC
5662 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
5663 arc_hdr_clear_flags(hdr,
5664 ARC_FLAG_PREFETCH |
5665 ARC_FLAG_PRESCIENT_PREFETCH);
5666 }
2a432414 5667 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
34dc7c2f
BB
5668 } else {
5669 new_state = arc_mfu;
2a432414 5670 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
34dc7c2f
BB
5671 }
5672
b9541d6b 5673 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414 5674 arc_change_state(new_state, hdr, hash_lock);
34dc7c2f 5675
b9541d6b 5676 atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
34dc7c2f 5677 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
b9541d6b 5678 } else if (hdr->b_l1hdr.b_state == arc_mfu) {
34dc7c2f
BB
5679 /*
5680 * This buffer has been accessed more than once and is
5681 * still in the cache. Keep it in the MFU state.
5682 *
5683 * NOTE: an add_reference() that occurred when we did
5684 * the arc_read() will have kicked this off the list.
5685 * If it was a prefetch, we will explicitly move it to
5686 * the head of the list now.
5687 */
d4a72f23 5688
b9541d6b 5689 atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
34dc7c2f 5690 ARCSTAT_BUMP(arcstat_mfu_hits);
b9541d6b
CW
5691 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5692 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
34dc7c2f
BB
5693 arc_state_t *new_state = arc_mfu;
5694 /*
5695 * This buffer has been accessed more than once but has
5696 * been evicted from the cache. Move it back to the
5697 * MFU state.
5698 */
5699
d4a72f23 5700 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
34dc7c2f
BB
5701 /*
5702 * This is a prefetch access...
5703 * move this block back to the MRU state.
5704 */
34dc7c2f
BB
5705 new_state = arc_mru;
5706 }
5707
b9541d6b 5708 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
5709 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5710 arc_change_state(new_state, hdr, hash_lock);
34dc7c2f 5711
b9541d6b 5712 atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
34dc7c2f 5713 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
b9541d6b 5714 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
34dc7c2f
BB
5715 /*
5716 * This buffer is on the 2nd Level ARC.
5717 */
5718
b9541d6b 5719 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
2a432414
GW
5720 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5721 arc_change_state(arc_mfu, hdr, hash_lock);
34dc7c2f 5722 } else {
b9541d6b
CW
5723 cmn_err(CE_PANIC, "invalid arc state 0x%p",
5724 hdr->b_l1hdr.b_state);
34dc7c2f
BB
5725 }
5726}
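
/*
 * In summary, arc_access() implements these transitions:
 *
 *	anon      -> mru	first (insertion) access
 *	mru       -> mfu	second access, once more than ARC_MINTIME
 *				has elapsed since the previous one
 *	mru_ghost -> mfu	hit after eviction (back to mru for
 *				prefetch reads)
 *	mfu       -> mfu	stays put, access time refreshed
 *	mfu_ghost -> mfu	hit after eviction (back to mru for
 *				prefetch reads)
 *	l2c_only  -> mfu	header resident only in L2ARC read back in
 */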
5727
0873bb63
BB
5728/*
5729 * This routine is called by dbuf_hold() to update the arc_access() state
5730 * which otherwise would be skipped for entries in the dbuf cache.
5731 */
5732void
5733arc_buf_access(arc_buf_t *buf)
5734{
5735 mutex_enter(&buf->b_evict_lock);
5736 arc_buf_hdr_t *hdr = buf->b_hdr;
5737
5738 /*
5739 * Avoid taking the hash_lock when possible as an optimization.
5740 * The header must be checked again under the hash_lock in order
5741 * to handle the case where it is concurrently being released.
5742 */
5743 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5744 mutex_exit(&buf->b_evict_lock);
5745 return;
5746 }
5747
5748 kmutex_t *hash_lock = HDR_LOCK(hdr);
5749 mutex_enter(hash_lock);
5750
5751 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5752 mutex_exit(hash_lock);
5753 mutex_exit(&buf->b_evict_lock);
5754 ARCSTAT_BUMP(arcstat_access_skip);
5755 return;
5756 }
5757
5758 mutex_exit(&buf->b_evict_lock);
5759
5760 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5761 hdr->b_l1hdr.b_state == arc_mfu);
5762
5763 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5764 arc_access(hdr, hash_lock);
5765 mutex_exit(hash_lock);
5766
5767 ARCSTAT_BUMP(arcstat_hits);
5768 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr),
5769 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
5770}
5771
b5256303 5772/* a generic arc_read_done_func_t which you can use */
34dc7c2f
BB
5773/* ARGSUSED */
5774void
d4a72f23
TC
5775arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5776 arc_buf_t *buf, void *arg)
34dc7c2f 5777{
d4a72f23
TC
5778 if (buf == NULL)
5779 return;
5780
5781 bcopy(buf->b_data, arg, arc_buf_size(buf));
d3c2ae1c 5782 arc_buf_destroy(buf, arg);
34dc7c2f
BB
5783}
5784
b5256303 5785/* a generic arc_read_done_func_t */
d4a72f23 5786/* ARGSUSED */
34dc7c2f 5787void
d4a72f23
TC
5788arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5789 arc_buf_t *buf, void *arg)
34dc7c2f
BB
5790{
5791 arc_buf_t **bufp = arg;
d4a72f23
TC
5792
5793 if (buf == NULL) {
c3bd3fb4 5794 ASSERT(zio == NULL || zio->io_error != 0);
34dc7c2f
BB
5795 *bufp = NULL;
5796 } else {
c3bd3fb4 5797 ASSERT(zio == NULL || zio->io_error == 0);
34dc7c2f 5798 *bufp = buf;
c3bd3fb4 5799 ASSERT(buf->b_data != NULL);
34dc7c2f
BB
5800 }
5801}
5802
d3c2ae1c
GW
5803static void
5804arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
5805{
5806 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
5807 ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
b5256303 5808 ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
d3c2ae1c
GW
5809 } else {
5810 if (HDR_COMPRESSION_ENABLED(hdr)) {
b5256303 5811 ASSERT3U(arc_hdr_get_compress(hdr), ==,
d3c2ae1c
GW
5812 BP_GET_COMPRESS(bp));
5813 }
5814 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
5815 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
b5256303 5816 ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
d3c2ae1c
GW
5817 }
5818}
5819
34dc7c2f
BB
5820static void
5821arc_read_done(zio_t *zio)
5822{
b5256303 5823 blkptr_t *bp = zio->io_bp;
d3c2ae1c 5824 arc_buf_hdr_t *hdr = zio->io_private;
9b67f605 5825 kmutex_t *hash_lock = NULL;
524b4217
DK
5826 arc_callback_t *callback_list;
5827 arc_callback_t *acb;
2aa34383 5828 boolean_t freeable = B_FALSE;
a7004725 5829
34dc7c2f
BB
5830 /*
5831 * The hdr was inserted into the hash table and removed from lists
5832 * prior to starting I/O. We should find this header, since
5833 * it's in the hash table, and it should be legit since it's
5834 * not possible to evict it during the I/O. The only possible
5835 * reason for it not to be found is if we were freed during the
5836 * read.
5837 */
9b67f605 5838 if (HDR_IN_HASH_TABLE(hdr)) {
31df97cd
DB
5839 arc_buf_hdr_t *found;
5840
9b67f605
MA
5841 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
5842 ASSERT3U(hdr->b_dva.dva_word[0], ==,
5843 BP_IDENTITY(zio->io_bp)->dva_word[0]);
5844 ASSERT3U(hdr->b_dva.dva_word[1], ==,
5845 BP_IDENTITY(zio->io_bp)->dva_word[1]);
5846
31df97cd 5847 found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);
9b67f605 5848
d3c2ae1c 5849 ASSERT((found == hdr &&
9b67f605
MA
5850 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
5851 (found == hdr && HDR_L2_READING(hdr)));
d3c2ae1c
GW
5852 ASSERT3P(hash_lock, !=, NULL);
5853 }
5854
b5256303
TC
5855 if (BP_IS_PROTECTED(bp)) {
5856 hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
5857 hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
5858 zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
5859 hdr->b_crypt_hdr.b_iv);
5860
5861 if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
5862 void *tmpbuf;
5863
5864 tmpbuf = abd_borrow_buf_copy(zio->io_abd,
5865 sizeof (zil_chain_t));
5866 zio_crypt_decode_mac_zil(tmpbuf,
5867 hdr->b_crypt_hdr.b_mac);
5868 abd_return_buf(zio->io_abd, tmpbuf,
5869 sizeof (zil_chain_t));
5870 } else {
5871 zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
5872 }
5873 }
5874
d4a72f23 5875 if (zio->io_error == 0) {
d3c2ae1c
GW
5876 /* byteswap if necessary */
5877 if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
5878 if (BP_GET_LEVEL(zio->io_bp) > 0) {
5879 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
5880 } else {
5881 hdr->b_l1hdr.b_byteswap =
5882 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
5883 }
5884 } else {
5885 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
5886 }
9b67f605 5887 }
34dc7c2f 5888
d3c2ae1c 5889 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
b9541d6b 5890 if (l2arc_noprefetch && HDR_PREFETCH(hdr))
d3c2ae1c 5891 arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
34dc7c2f 5892
b9541d6b 5893 callback_list = hdr->b_l1hdr.b_acb;
d3c2ae1c 5894 ASSERT3P(callback_list, !=, NULL);
34dc7c2f 5895
d4a72f23
TC
5896 if (hash_lock && zio->io_error == 0 &&
5897 hdr->b_l1hdr.b_state == arc_anon) {
428870ff
BB
5898 /*
5899 * Only call arc_access on anonymous buffers. This is because
5900 * if we've issued an I/O for an evicted buffer, we've already
5901 * called arc_access (to prevent any simultaneous readers from
5902 * getting confused).
5903 */
5904 arc_access(hdr, hash_lock);
5905 }
5906
524b4217
DK
5907 /*
5908 * If a read request has a callback (i.e. acb_done is not NULL), then we
5909 * make a buf containing the data according to the parameters which were
5910 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
5911 * aren't needlessly decompressing the data multiple times.
5912 */
a7004725 5913 int callback_cnt = 0;
2aa34383
DK
5914 for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
5915 if (!acb->acb_done)
5916 continue;
5917
2aa34383 5918 callback_cnt++;
524b4217 5919
d4a72f23
TC
5920 if (zio->io_error != 0)
5921 continue;
5922
b5256303 5923 int error = arc_buf_alloc_impl(hdr, zio->io_spa,
be9a5c35 5924 &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
d4a72f23 5925 acb->acb_compressed, acb->acb_noauth, B_TRUE,
440a3eb9 5926 &acb->acb_buf);
b5256303
TC
5927
5928 /*
440a3eb9 5929 * Assert non-speculative zios didn't fail because an
b5256303
TC
5930 * encryption key wasn't loaded
5931 */
a2c2ed1b 5932 ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
be9a5c35 5933 error != EACCES);
b5256303
TC
5934
5935 /*
5936 * If we failed to decrypt, report an error now (as the zio
5937 * layer would have done if it had done the transforms).
5938 */
5939 if (error == ECKSUM) {
5940 ASSERT(BP_IS_PROTECTED(bp));
5941 error = SET_ERROR(EIO);
b5256303 5942 if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
be9a5c35 5943 spa_log_error(zio->io_spa, &acb->acb_zb);
b5256303 5944 zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
be9a5c35 5945 zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0);
b5256303
TC
5946 }
5947 }
5948
c3bd3fb4
TC
5949 if (error != 0) {
5950 /*
5951 * Decompression or decryption failed. Set
5952 * io_error so that when we call acb_done
5953 * (below), we will indicate that the read
5954 * failed. Note that in the unusual case
5955 * where one callback is compressed and another
5956 * uncompressed, we will mark all of them
5957 * as failed, even though the uncompressed
5958 * one can't actually fail. In this case,
5959 * the hdr will not be anonymous, because
5960 * if there are multiple callbacks, it's
5961 * because multiple threads found the same
5962 * arc buf in the hash table.
5963 */
524b4217 5964 zio->io_error = error;
c3bd3fb4 5965 }
34dc7c2f 5966 }
c3bd3fb4
TC
5967
5968 /*
5969 * If there are multiple callbacks, we must have the hash lock,
5970 * because the only way for multiple threads to find this hdr is
5971 * in the hash table. This ensures that if there are multiple
5972 * callbacks, the hdr is not anonymous. If it were anonymous,
5973 * we couldn't use arc_buf_destroy() in the error case below.
5974 */
5975 ASSERT(callback_cnt < 2 || hash_lock != NULL);
5976
b9541d6b 5977 hdr->b_l1hdr.b_acb = NULL;
d3c2ae1c 5978 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
440a3eb9 5979 if (callback_cnt == 0)
b5256303 5980 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
34dc7c2f 5981
b9541d6b
CW
5982 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
5983 callback_list != NULL);
34dc7c2f 5984
d4a72f23 5985 if (zio->io_error == 0) {
d3c2ae1c
GW
5986 arc_hdr_verify(hdr, zio->io_bp);
5987 } else {
5988 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
b9541d6b 5989 if (hdr->b_l1hdr.b_state != arc_anon)
34dc7c2f
BB
5990 arc_change_state(arc_anon, hdr, hash_lock);
5991 if (HDR_IN_HASH_TABLE(hdr))
5992 buf_hash_remove(hdr);
b9541d6b 5993 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
34dc7c2f
BB
5994 }
5995
5996 /*
5997 * Broadcast before we drop the hash_lock to avoid the possibility
5998 * that the hdr (and hence the cv) might be freed before we get to
5999 * the cv_broadcast().
6000 */
b9541d6b 6001 cv_broadcast(&hdr->b_l1hdr.b_cv);
34dc7c2f 6002
b9541d6b 6003 if (hash_lock != NULL) {
34dc7c2f
BB
6004 mutex_exit(hash_lock);
6005 } else {
6006 /*
6007 * This block was freed while we waited for the read to
6008 * complete. It has been removed from the hash table and
6009 * moved to the anonymous state (so that it won't show up
6010 * in the cache).
6011 */
b9541d6b
CW
6012 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
6013 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
34dc7c2f
BB
6014 }
6015
6016 /* execute each callback and free its structure */
6017 while ((acb = callback_list) != NULL) {
c3bd3fb4
TC
6018 if (acb->acb_done != NULL) {
6019 if (zio->io_error != 0 && acb->acb_buf != NULL) {
6020 /*
6021 * If arc_buf_alloc_impl() fails during
6022 * decompression, the buf will still be
6023 * allocated, and needs to be freed here.
6024 */
6025 arc_buf_destroy(acb->acb_buf,
6026 acb->acb_private);
6027 acb->acb_buf = NULL;
6028 }
d4a72f23
TC
6029 acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
6030 acb->acb_buf, acb->acb_private);
b5256303 6031 }
34dc7c2f
BB
6032
6033 if (acb->acb_zio_dummy != NULL) {
6034 acb->acb_zio_dummy->io_error = zio->io_error;
6035 zio_nowait(acb->acb_zio_dummy);
6036 }
6037
6038 callback_list = acb->acb_next;
6039 kmem_free(acb, sizeof (arc_callback_t));
6040 }
6041
6042 if (freeable)
6043 arc_hdr_destroy(hdr);
6044}
6045
6046/*
5c839890 6047 * "Read" the block at the specified DVA (in bp) via the
34dc7c2f
BB
6048 * cache. If the block is found in the cache, invoke the provided
6049 * callback immediately and return. Note that the `zio' parameter
6050 * in the callback will be NULL in this case, since no IO was
6051 * required. If the block is not in the cache, pass the read request
6052 * on to the spa with a substitute callback function, so that the
6053 * requested block will be added to the cache.
6054 *
6055 * If a read request arrives for a block that has a read in-progress,
6056 * either wait for the in-progress read to complete (and return the
6057 * results); or, if this is a read with a "done" func, add a record
6058 * to the read to invoke the "done" func when the read completes,
6059 * and return; or just return.
6060 *
6061 * arc_read_done() will invoke all the requested "done" functions
6062 * for readers of this block.
6063 */
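/*
 * Illustrative sketch, not taken from this file: one plausible way a
 * caller could issue a read through the interface described above.  The
 * callback name my_read_done and the consumer context are assumptions
 * for illustration only; the arc_read() signature and the shape of the
 * done callback (zio, bookmark, bp, buf, private) follow the definitions
 * and the acb_done invocation in this file.
 *
 *	static void
 *	my_read_done(zio_t *zio, const zbookmark_phys_t *zb,
 *	    const blkptr_t *bp, arc_buf_t *buf, void *private)
 *	{
 *		(zio is NULL on a cache hit; buf is NULL if the read failed)
 *		if (buf != NULL) {
 *			(consume buf->b_data here)
 *			arc_buf_destroy(buf, private);
 *		}
 *	}
 *
 *	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_L2CACHE;
 *	(void) arc_read(NULL, spa, bp, my_read_done, tag,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 */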
6064int
b5256303
TC
6065arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
6066 arc_read_done_func_t *done, void *private, zio_priority_t priority,
6067 int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
34dc7c2f 6068{
9b67f605 6069 arc_buf_hdr_t *hdr = NULL;
9b67f605 6070 kmutex_t *hash_lock = NULL;
34dc7c2f 6071 zio_t *rzio;
3541dc6d 6072 uint64_t guid = spa_load_guid(spa);
b5256303
TC
6073 boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
6074 boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
6075 (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
6076 boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
6077 (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
1421c891 6078 int rc = 0;
34dc7c2f 6079
9b67f605
MA
6080 ASSERT(!BP_IS_EMBEDDED(bp) ||
6081 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
6082
34dc7c2f 6083top:
9b67f605
MA
6084 if (!BP_IS_EMBEDDED(bp)) {
6085 /*
6086 * Embedded BP's have no DVA and require no I/O to "read".
6087 * Create an anonymous arc buf to back it.
6088 */
6089 hdr = buf_hash_find(guid, bp, &hash_lock);
6090 }
6091
b5256303
TC
6092 /*
6093 * Determine if we have an L1 cache hit or a cache miss. For simplicity
6094 * we maintain encrypted data separately from compressed / uncompressed
6095 * data. If the user is requesting raw encrypted data and we don't have
6096 * that in the header we will read from disk to guarantee that we can
6097 * get it even if the encryption keys aren't loaded.
6098 */
6099 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
6100 (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
d3c2ae1c 6101 arc_buf_t *buf = NULL;
2a432414 6102 *arc_flags |= ARC_FLAG_CACHED;
34dc7c2f
BB
6103
6104 if (HDR_IO_IN_PROGRESS(hdr)) {
a8b2e306 6105 zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
34dc7c2f 6106
a8b2e306 6107 ASSERT3P(head_zio, !=, NULL);
7f60329a
MA
6108 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
6109 priority == ZIO_PRIORITY_SYNC_READ) {
6110 /*
a8b2e306
TC
6111 * This is a sync read that needs to wait for
6112 * an in-flight async read. Request that the
6113 * zio have its priority upgraded.
7f60329a 6114 */
a8b2e306
TC
6115 zio_change_priority(head_zio, priority);
6116 DTRACE_PROBE1(arc__async__upgrade__sync,
7f60329a 6117 arc_buf_hdr_t *, hdr);
a8b2e306 6118 ARCSTAT_BUMP(arcstat_async_upgrade_sync);
7f60329a
MA
6119 }
6120 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
d3c2ae1c
GW
6121 arc_hdr_clear_flags(hdr,
6122 ARC_FLAG_PREDICTIVE_PREFETCH);
7f60329a
MA
6123 }
6124
2a432414 6125 if (*arc_flags & ARC_FLAG_WAIT) {
b9541d6b 6126 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
34dc7c2f
BB
6127 mutex_exit(hash_lock);
6128 goto top;
6129 }
2a432414 6130 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
34dc7c2f
BB
6131
6132 if (done) {
7f60329a 6133 arc_callback_t *acb = NULL;
34dc7c2f
BB
6134
6135 acb = kmem_zalloc(sizeof (arc_callback_t),
79c76d5b 6136 KM_SLEEP);
34dc7c2f
BB
6137 acb->acb_done = done;
6138 acb->acb_private = private;
a7004725 6139 acb->acb_compressed = compressed_read;
440a3eb9
TC
6140 acb->acb_encrypted = encrypted_read;
6141 acb->acb_noauth = noauth_read;
be9a5c35 6142 acb->acb_zb = *zb;
34dc7c2f
BB
6143 if (pio != NULL)
6144 acb->acb_zio_dummy = zio_null(pio,
d164b209 6145 spa, NULL, NULL, NULL, zio_flags);
34dc7c2f 6146
d3c2ae1c 6147 ASSERT3P(acb->acb_done, !=, NULL);
a8b2e306 6148 acb->acb_zio_head = head_zio;
b9541d6b
CW
6149 acb->acb_next = hdr->b_l1hdr.b_acb;
6150 hdr->b_l1hdr.b_acb = acb;
34dc7c2f 6151 mutex_exit(hash_lock);
1421c891 6152 goto out;
34dc7c2f
BB
6153 }
6154 mutex_exit(hash_lock);
1421c891 6155 goto out;
34dc7c2f
BB
6156 }
6157
b9541d6b
CW
6158 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
6159 hdr->b_l1hdr.b_state == arc_mfu);
34dc7c2f
BB
6160
6161 if (done) {
7f60329a
MA
6162 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
6163 /*
6164 * This is a demand read which does not have to
6165 * wait for i/o because we did a predictive
6166 * prefetch i/o for it, which has completed.
6167 */
6168 DTRACE_PROBE1(
6169 arc__demand__hit__predictive__prefetch,
6170 arc_buf_hdr_t *, hdr);
6171 ARCSTAT_BUMP(
6172 arcstat_demand_hit_predictive_prefetch);
d3c2ae1c
GW
6173 arc_hdr_clear_flags(hdr,
6174 ARC_FLAG_PREDICTIVE_PREFETCH);
7f60329a 6175 }
d4a72f23
TC
6176
6177 if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
6178 ARCSTAT_BUMP(
6179 arcstat_demand_hit_prescient_prefetch);
6180 arc_hdr_clear_flags(hdr,
6181 ARC_FLAG_PRESCIENT_PREFETCH);
6182 }
6183
d3c2ae1c
GW
6184 ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
6185
524b4217 6186 /* Get a buf with the desired data in it. */
be9a5c35
TC
6187 rc = arc_buf_alloc_impl(hdr, spa, zb, private,
6188 encrypted_read, compressed_read, noauth_read,
6189 B_TRUE, &buf);
a2c2ed1b
TC
6190 if (rc == ECKSUM) {
6191 /*
6192 * Convert authentication and decryption errors
be9a5c35
TC
6193 * to EIO (and generate an ereport if needed)
6194 * before leaving the ARC.
a2c2ed1b
TC
6195 */
6196 rc = SET_ERROR(EIO);
be9a5c35
TC
6197 if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
6198 spa_log_error(spa, zb);
6199 zfs_ereport_post(
6200 FM_EREPORT_ZFS_AUTHENTICATION,
6201 spa, NULL, zb, NULL, 0, 0);
6202 }
a2c2ed1b 6203 }
d4a72f23 6204 if (rc != 0) {
2c24b5b1
TC
6205 (void) remove_reference(hdr, hash_lock,
6206 private);
6207 arc_buf_destroy_impl(buf);
d4a72f23
TC
6208 buf = NULL;
6209 }
6210
a2c2ed1b
TC
6211 /* assert any errors weren't due to unloaded keys */
6212 ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
be9a5c35 6213 rc != EACCES);
2a432414 6214 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
b9541d6b 6215 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
d3c2ae1c 6216 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
34dc7c2f
BB
6217 }
6218 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
6219 arc_access(hdr, hash_lock);
d4a72f23
TC
6220 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
6221 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
2a432414 6222 if (*arc_flags & ARC_FLAG_L2CACHE)
d3c2ae1c 6223 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
34dc7c2f
BB
6224 mutex_exit(hash_lock);
6225 ARCSTAT_BUMP(arcstat_hits);
b9541d6b
CW
6226 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
6227 demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
34dc7c2f
BB
6228 data, metadata, hits);
6229
6230 if (done)
d4a72f23 6231 done(NULL, zb, bp, buf, private);
34dc7c2f 6232 } else {
d3c2ae1c
GW
6233 uint64_t lsize = BP_GET_LSIZE(bp);
6234 uint64_t psize = BP_GET_PSIZE(bp);
9b67f605 6235 arc_callback_t *acb;
b128c09f 6236 vdev_t *vd = NULL;
a117a6d6 6237 uint64_t addr = 0;
d164b209 6238 boolean_t devw = B_FALSE;
d3c2ae1c 6239 uint64_t size;
440a3eb9 6240 abd_t *hdr_abd;
34dc7c2f 6241
5f6d0b6f
BB
6242 /*
6243 * Gracefully handle a damaged logical block size as a
1cdb86cb 6244 * checksum error.
5f6d0b6f 6245 */
d3c2ae1c 6246 if (lsize > spa_maxblocksize(spa)) {
1cdb86cb 6247 rc = SET_ERROR(ECKSUM);
5f6d0b6f
BB
6248 goto out;
6249 }
6250
34dc7c2f
BB
6251 if (hdr == NULL) {
6252 /* this block is not in the cache */
9b67f605 6253 arc_buf_hdr_t *exists = NULL;
34dc7c2f 6254 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
d3c2ae1c 6255 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
b5256303
TC
6256 BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type,
6257 encrypted_read);
d3c2ae1c 6258
9b67f605
MA
6259 if (!BP_IS_EMBEDDED(bp)) {
6260 hdr->b_dva = *BP_IDENTITY(bp);
6261 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
9b67f605
MA
6262 exists = buf_hash_insert(hdr, &hash_lock);
6263 }
6264 if (exists != NULL) {
34dc7c2f
BB
6265 /* somebody beat us to the hash insert */
6266 mutex_exit(hash_lock);
428870ff 6267 buf_discard_identity(hdr);
d3c2ae1c 6268 arc_hdr_destroy(hdr);
34dc7c2f
BB
6269 goto top; /* restart the IO request */
6270 }
34dc7c2f 6271 } else {
b9541d6b 6272 /*
b5256303
TC
6273 * This block is in the ghost cache or encrypted data
6274 * was requested and we didn't have it. If it was
6275 * L2-only (and thus didn't have an L1 hdr),
6276 * we realloc the header to add an L1 hdr.
b9541d6b
CW
6277 */
6278 if (!HDR_HAS_L1HDR(hdr)) {
6279 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
6280 hdr_full_cache);
6281 }
6282
b5256303
TC
6283 if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
6284 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6285 ASSERT(!HDR_HAS_RABD(hdr));
6286 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6287 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
6288 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
6289 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
6290 } else if (HDR_IO_IN_PROGRESS(hdr)) {
6291 /*
6292 * If this header already had an IO in progress
6293 * and we are performing another IO to fetch
6294 * encrypted data, we must wait until the first
6295 * IO completes so as not to confuse
6296 * arc_read_done(). This should be very rare
6297 * and so the performance impact shouldn't
6298 * matter.
6299 */
6300 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
6301 mutex_exit(hash_lock);
6302 goto top;
6303 }
34dc7c2f 6304
7f60329a 6305 /*
d3c2ae1c 6306 * This is a delicate dance that we play here.
b5256303
TC
6307 * This hdr might be in the ghost list so we access
6308 * it to move it out of the ghost list before we
d3c2ae1c
GW
6309 * initiate the read. If it's a prefetch then
6310 * it won't have a callback so we'll remove the
6311 * reference that arc_buf_alloc_impl() created. We
6312 * do this after we've called arc_access() to
6313 * avoid hitting an assert in remove_reference().
7f60329a 6314 */
428870ff 6315 arc_access(hdr, hash_lock);
b5256303 6316 arc_hdr_alloc_abd(hdr, encrypted_read);
d3c2ae1c 6317 }
d3c2ae1c 6318
b5256303
TC
6319 if (encrypted_read) {
6320 ASSERT(HDR_HAS_RABD(hdr));
6321 size = HDR_GET_PSIZE(hdr);
6322 hdr_abd = hdr->b_crypt_hdr.b_rabd;
d3c2ae1c 6323 zio_flags |= ZIO_FLAG_RAW;
b5256303
TC
6324 } else {
6325 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
6326 size = arc_hdr_size(hdr);
6327 hdr_abd = hdr->b_l1hdr.b_pabd;
6328
6329 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
6330 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
6331 }
6332
6333 /*
6334 * For authenticated bp's, we do not ask the ZIO layer
6335 * to authenticate them since this will cause the entire
6336 * IO to fail if the key isn't loaded. Instead, we
6337 * defer authentication until arc_buf_fill(), which will
6338 * verify the data when the key is available.
6339 */
6340 if (BP_IS_AUTHENTICATED(bp))
6341 zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
34dc7c2f
BB
6342 }
6343
b5256303
TC
6344 if (*arc_flags & ARC_FLAG_PREFETCH &&
6345 refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
d3c2ae1c 6346 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
d4a72f23
TC
6347 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
6348 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
d3c2ae1c
GW
6349 if (*arc_flags & ARC_FLAG_L2CACHE)
6350 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
b5256303
TC
6351 if (BP_IS_AUTHENTICATED(bp))
6352 arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
d3c2ae1c
GW
6353 if (BP_GET_LEVEL(bp) > 0)
6354 arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
7f60329a 6355 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
d3c2ae1c 6356 arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
b9541d6b 6357 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
428870ff 6358
79c76d5b 6359 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
34dc7c2f
BB
6360 acb->acb_done = done;
6361 acb->acb_private = private;
2aa34383 6362 acb->acb_compressed = compressed_read;
b5256303
TC
6363 acb->acb_encrypted = encrypted_read;
6364 acb->acb_noauth = noauth_read;
be9a5c35 6365 acb->acb_zb = *zb;
34dc7c2f 6366
d3c2ae1c 6367 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
b9541d6b 6368 hdr->b_l1hdr.b_acb = acb;
d3c2ae1c 6369 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
34dc7c2f 6370
b9541d6b
CW
6371 if (HDR_HAS_L2HDR(hdr) &&
6372 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
6373 devw = hdr->b_l2hdr.b_dev->l2ad_writing;
6374 addr = hdr->b_l2hdr.b_daddr;
b128c09f 6375 /*
a1d477c2 6376 * Lock out L2ARC device removal.
b128c09f
BB
6377 */
6378 if (vdev_is_dead(vd) ||
6379 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
6380 vd = NULL;
6381 }
6382
a8b2e306
TC
6383 /*
6384 * We count both async reads and scrub IOs as asynchronous so
6385 * that both can be upgraded in the event of a cache hit while
6386 * the read IO is still in-flight.
6387 */
6388 if (priority == ZIO_PRIORITY_ASYNC_READ ||
6389 priority == ZIO_PRIORITY_SCRUB)
d3c2ae1c
GW
6390 arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6391 else
6392 arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6393
e49f1e20
WA
6394 /*
6395 * At this point, we have a level 1 cache miss. Try again in
6396 * L2ARC if possible.
6397 */
d3c2ae1c
GW
6398 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
6399
428870ff 6400 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
d3c2ae1c 6401 uint64_t, lsize, zbookmark_phys_t *, zb);
34dc7c2f 6402 ARCSTAT_BUMP(arcstat_misses);
b9541d6b
CW
6403 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
6404 demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
34dc7c2f
BB
6405 data, metadata, misses);
6406
d164b209 6407 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
34dc7c2f
BB
6408 /*
6409 * Read from the L2ARC if the following are true:
b128c09f
BB
6410 * 1. The L2ARC vdev was previously cached.
6411 * 2. This buffer still has L2ARC metadata.
6412 * 3. This buffer isn't currently writing to the L2ARC.
6413 * 4. The L2ARC entry wasn't evicted, which may
6414 * also have invalidated the vdev.
d164b209 6415 * 5. This isn't a prefetch buffer while l2arc_noprefetch is set.
34dc7c2f 6416 */
b9541d6b 6417 if (HDR_HAS_L2HDR(hdr) &&
d164b209
BB
6418 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
6419 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
34dc7c2f 6420 l2arc_read_callback_t *cb;
82710e99
GDN
6421 abd_t *abd;
6422 uint64_t asize;
34dc7c2f
BB
6423
6424 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
6425 ARCSTAT_BUMP(arcstat_l2_hits);
b9541d6b 6426 atomic_inc_32(&hdr->b_l2hdr.b_hits);
34dc7c2f 6427
34dc7c2f 6428 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
79c76d5b 6429 KM_SLEEP);
d3c2ae1c 6430 cb->l2rcb_hdr = hdr;
34dc7c2f
BB
6431 cb->l2rcb_bp = *bp;
6432 cb->l2rcb_zb = *zb;
b128c09f 6433 cb->l2rcb_flags = zio_flags;
34dc7c2f 6434
82710e99
GDN
6435 asize = vdev_psize_to_asize(vd, size);
6436 if (asize != size) {
6437 abd = abd_alloc_for_io(asize,
6438 HDR_ISTYPE_METADATA(hdr));
6439 cb->l2rcb_abd = abd;
6440 } else {
b5256303 6441 abd = hdr_abd;
82710e99
GDN
6442 }
6443
a117a6d6 6444 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
82710e99 6445 addr + asize <= vd->vdev_psize -
a117a6d6
GW
6446 VDEV_LABEL_END_SIZE);
6447
34dc7c2f 6448 /*
b128c09f
BB
6449 * l2arc read. The SCL_L2ARC lock will be
6450 * released by l2arc_read_done().
3a17a7a9
SK
6451 * Issue a null zio if the underlying buffer
6452 * was squashed to zero size by compression.
34dc7c2f 6453 */
b5256303 6454 ASSERT3U(arc_hdr_get_compress(hdr), !=,
d3c2ae1c
GW
6455 ZIO_COMPRESS_EMPTY);
6456 rzio = zio_read_phys(pio, vd, addr,
82710e99 6457 asize, abd,
d3c2ae1c
GW
6458 ZIO_CHECKSUM_OFF,
6459 l2arc_read_done, cb, priority,
6460 zio_flags | ZIO_FLAG_DONT_CACHE |
6461 ZIO_FLAG_CANFAIL |
6462 ZIO_FLAG_DONT_PROPAGATE |
6463 ZIO_FLAG_DONT_RETRY, B_FALSE);
a8b2e306
TC
6464 acb->acb_zio_head = rzio;
6465
6466 if (hash_lock != NULL)
6467 mutex_exit(hash_lock);
d3c2ae1c 6468
34dc7c2f
BB
6469 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
6470 zio_t *, rzio);
b5256303
TC
6471 ARCSTAT_INCR(arcstat_l2_read_bytes,
6472 HDR_GET_PSIZE(hdr));
34dc7c2f 6473
2a432414 6474 if (*arc_flags & ARC_FLAG_NOWAIT) {
b128c09f 6475 zio_nowait(rzio);
1421c891 6476 goto out;
b128c09f 6477 }
34dc7c2f 6478
2a432414 6479 ASSERT(*arc_flags & ARC_FLAG_WAIT);
b128c09f 6480 if (zio_wait(rzio) == 0)
1421c891 6481 goto out;
b128c09f
BB
6482
6483 /* l2arc read error; goto zio_read() */
a8b2e306
TC
6484 if (hash_lock != NULL)
6485 mutex_enter(hash_lock);
34dc7c2f
BB
6486 } else {
6487 DTRACE_PROBE1(l2arc__miss,
6488 arc_buf_hdr_t *, hdr);
6489 ARCSTAT_BUMP(arcstat_l2_misses);
6490 if (HDR_L2_WRITING(hdr))
6491 ARCSTAT_BUMP(arcstat_l2_rw_clash);
b128c09f 6492 spa_config_exit(spa, SCL_L2ARC, vd);
34dc7c2f 6493 }
d164b209
BB
6494 } else {
6495 if (vd != NULL)
6496 spa_config_exit(spa, SCL_L2ARC, vd);
6497 if (l2arc_ndev != 0) {
6498 DTRACE_PROBE1(l2arc__miss,
6499 arc_buf_hdr_t *, hdr);
6500 ARCSTAT_BUMP(arcstat_l2_misses);
6501 }
34dc7c2f 6502 }
34dc7c2f 6503
b5256303 6504 rzio = zio_read(pio, spa, bp, hdr_abd, size,
d3c2ae1c 6505 arc_read_done, hdr, priority, zio_flags, zb);
a8b2e306
TC
6506 acb->acb_zio_head = rzio;
6507
6508 if (hash_lock != NULL)
6509 mutex_exit(hash_lock);
34dc7c2f 6510
2a432414 6511 if (*arc_flags & ARC_FLAG_WAIT) {
1421c891
PS
6512 rc = zio_wait(rzio);
6513 goto out;
6514 }
34dc7c2f 6515
2a432414 6516 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
34dc7c2f
BB
6517 zio_nowait(rzio);
6518 }
1421c891
PS
6519
6520out:
157ef7f6
TC
6521 /* embedded bps don't actually go to disk */
6522 if (!BP_IS_EMBEDDED(bp))
6523 spa_read_history_add(spa, zb, *arc_flags);
1421c891 6524 return (rc);
34dc7c2f
BB
6525}
6526
ab26409d
BB
6527arc_prune_t *
6528arc_add_prune_callback(arc_prune_func_t *func, void *private)
6529{
6530 arc_prune_t *p;
6531
d1d7e268 6532 p = kmem_alloc(sizeof (*p), KM_SLEEP);
ab26409d
BB
6533 p->p_pfunc = func;
6534 p->p_private = private;
6535 list_link_init(&p->p_node);
6536 refcount_create(&p->p_refcnt);
6537
6538 mutex_enter(&arc_prune_mtx);
6539 refcount_add(&p->p_refcnt, &arc_prune_list);
6540 list_insert_head(&arc_prune_list, p);
6541 mutex_exit(&arc_prune_mtx);
6542
6543 return (p);
6544}
6545
6546void
6547arc_remove_prune_callback(arc_prune_t *p)
6548{
4442f60d 6549 boolean_t wait = B_FALSE;
ab26409d
BB
6550 mutex_enter(&arc_prune_mtx);
6551 list_remove(&arc_prune_list, p);
4442f60d
CC
6552 if (refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
6553 wait = B_TRUE;
ab26409d 6554 mutex_exit(&arc_prune_mtx);
4442f60d
CC
6555
6556 /* wait for arc_prune_task to finish */
6557 if (wait)
6558 taskq_wait_outstanding(arc_prune_taskq, 0);
6559 ASSERT0(refcount_count(&p->p_refcnt));
6560 refcount_destroy(&p->p_refcnt);
6561 kmem_free(p, sizeof (*p));
ab26409d
BB
6562}
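/*
 * Illustrative sketch, not taken from this file: registering and later
 * tearing down an ARC prune callback.  The callback body, its argument
 * meaning and the my_prune_cb/sb names are assumptions for illustration;
 * only arc_add_prune_callback() and arc_remove_prune_callback() above
 * are taken from this file.
 *
 *	static void
 *	my_prune_cb(int64_t nr_to_scan, void *private)
 *	{
 *		(ask the consumer identified by private to release up to
 *		 nr_to_scan of its ARC metadata holds, e.g. cached dnodes)
 *	}
 *
 *	arc_prune_t *ap = arc_add_prune_callback(my_prune_cb, sb);
 *	...
 *	arc_remove_prune_callback(ap);	(also waits for in-flight prune work)
 */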
6563
df4474f9
MA
6564/*
6565 * Notify the arc that a block was freed, and thus will never be used again.
6566 */
6567void
6568arc_freed(spa_t *spa, const blkptr_t *bp)
6569{
6570 arc_buf_hdr_t *hdr;
6571 kmutex_t *hash_lock;
6572 uint64_t guid = spa_load_guid(spa);
6573
9b67f605
MA
6574 ASSERT(!BP_IS_EMBEDDED(bp));
6575
6576 hdr = buf_hash_find(guid, bp, &hash_lock);
df4474f9
MA
6577 if (hdr == NULL)
6578 return;
df4474f9 6579
d3c2ae1c
GW
6580 /*
6581 * We might be trying to free a block that is still doing I/O
6582 * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
6583 * dmu_sync-ed block). If this block is being prefetched, then it
6584 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
6585 * until the I/O completes. A block may also have a reference if it is
6586 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
6587 * have written the new block to its final resting place on disk but
6588 * without the dedup flag set. This would have left the hdr in the MRU
6589 * state and discoverable. When the txg finally syncs, it detects that
6590 * the block was overridden in open context and issues an override I/O.
6591 * Since this is a dedup block, the override I/O will determine if the
6592 * block is already in the DDT. If so, then it will replace the io_bp
6593 * with the bp from the DDT and allow the I/O to finish. When the I/O
6594 * reaches the done callback, dbuf_write_override_done, it will
6595 * check to see if the io_bp and io_bp_override are identical.
6596 * If they are not, then it indicates that the bp was replaced with
6597 * the bp in the DDT and the override bp is freed. This allows
6598 * us to arrive here with a reference on a block that is being
6599 * freed. So if we have an I/O in progress, or a reference to
6600 * this hdr, then we don't destroy the hdr.
6601 */
6602 if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
6603 refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
6604 arc_change_state(arc_anon, hdr, hash_lock);
6605 arc_hdr_destroy(hdr);
df4474f9 6606 mutex_exit(hash_lock);
bd089c54 6607 } else {
d3c2ae1c 6608 mutex_exit(hash_lock);
34dc7c2f 6609 }
34dc7c2f 6610
34dc7c2f
BB
6611}
6612
6613/*
e49f1e20
WA
6614 * Release this buffer from the cache, making it an anonymous buffer. This
6615 * must be done after a read and prior to modifying the buffer contents.
34dc7c2f 6616 * If the buffer has more than one reference, we must make
b128c09f 6617 * a new hdr for the buffer.
34dc7c2f
BB
6618 */
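/*
 * Illustrative sketch, not taken from this file: the read-then-modify
 * pattern the comment above describes.  Everything except arc_read(),
 * arc_release() and arc_released() is an assumption for illustration.
 *
 *	arc_buf_t *buf;			(obtained from a completed arc_read())
 *
 *	arc_release(buf, tag);		(detach buf from the cached hdr)
 *	ASSERT(arc_released(buf));
 *	(modify buf->b_data in place; safe only after the release)
 */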
6619void
6620arc_release(arc_buf_t *buf, void *tag)
6621{
b9541d6b 6622 arc_buf_hdr_t *hdr = buf->b_hdr;
34dc7c2f 6623
428870ff 6624 /*
ca0bf58d 6625 * It would be nice to assert that if it's DMU metadata (level >
428870ff
BB
6626 * 0 || it's the dnode file), then it must be syncing context.
6627 * But we don't know that information at this level.
6628 */
6629
6630 mutex_enter(&buf->b_evict_lock);
b128c09f 6631
ca0bf58d
PS
6632 ASSERT(HDR_HAS_L1HDR(hdr));
6633
b9541d6b
CW
6634 /*
6635 * We don't grab the hash lock prior to this check, because if
6636 * the buffer's header is in the arc_anon state, it won't be
6637 * linked into the hash table.
6638 */
6639 if (hdr->b_l1hdr.b_state == arc_anon) {
6640 mutex_exit(&buf->b_evict_lock);
6641 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6642 ASSERT(!HDR_IN_HASH_TABLE(hdr));
6643 ASSERT(!HDR_HAS_L2HDR(hdr));
d3c2ae1c 6644 ASSERT(HDR_EMPTY(hdr));
34dc7c2f 6645
d3c2ae1c 6646 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
b9541d6b
CW
6647 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
6648 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
6649
b9541d6b 6650 hdr->b_l1hdr.b_arc_access = 0;
d3c2ae1c
GW
6651
6652 /*
6653 * If the buf is being overridden then it may already
6654 * have a hdr that is not empty.
6655 */
6656 buf_discard_identity(hdr);
b9541d6b
CW
6657 arc_buf_thaw(buf);
6658
6659 return;
34dc7c2f
BB
6660 }
6661
1c27024e 6662 kmutex_t *hash_lock = HDR_LOCK(hdr);
b9541d6b
CW
6663 mutex_enter(hash_lock);
6664
6665 /*
6666 * This assignment is only valid as long as the hash_lock is
6667 * held, we must be careful not to reference state or the
6668 * b_state field after dropping the lock.
6669 */
1c27024e 6670 arc_state_t *state = hdr->b_l1hdr.b_state;
b9541d6b
CW
6671 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6672 ASSERT3P(state, !=, arc_anon);
6673
6674 /* this buffer is not on any list */
2aa34383 6675 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
b9541d6b
CW
6676
6677 if (HDR_HAS_L2HDR(hdr)) {
b9541d6b 6678 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
ca0bf58d
PS
6679
6680 /*
d962d5da
PS
6681 * We have to recheck this conditional now that
6682 * we're holding the l2ad_mtx to prevent a race with
6683 * another thread which might be concurrently calling
6684 * l2arc_evict(). In that case, l2arc_evict() might have
6685 * destroyed the header's L2 portion as we were waiting
6686 * to acquire the l2ad_mtx.
ca0bf58d 6687 */
d962d5da
PS
6688 if (HDR_HAS_L2HDR(hdr))
6689 arc_hdr_l2hdr_destroy(hdr);
ca0bf58d 6690
b9541d6b 6691 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
b128c09f
BB
6692 }
6693
34dc7c2f
BB
6694 /*
6695 * Do we have more than one buf?
6696 */
d3c2ae1c 6697 if (hdr->b_l1hdr.b_bufcnt > 1) {
34dc7c2f 6698 arc_buf_hdr_t *nhdr;
d164b209 6699 uint64_t spa = hdr->b_spa;
d3c2ae1c
GW
6700 uint64_t psize = HDR_GET_PSIZE(hdr);
6701 uint64_t lsize = HDR_GET_LSIZE(hdr);
b5256303
TC
6702 boolean_t protected = HDR_PROTECTED(hdr);
6703 enum zio_compress compress = arc_hdr_get_compress(hdr);
b9541d6b 6704 arc_buf_contents_t type = arc_buf_type(hdr);
d3c2ae1c 6705 VERIFY3U(hdr->b_type, ==, type);
34dc7c2f 6706
b9541d6b 6707 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
d3c2ae1c
GW
6708 (void) remove_reference(hdr, hash_lock, tag);
6709
524b4217 6710 if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
d3c2ae1c 6711 ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
524b4217
DK
6712 ASSERT(ARC_BUF_LAST(buf));
6713 }
d3c2ae1c 6714
34dc7c2f 6715 /*
428870ff 6716 * Pull the data off of this hdr and attach it to
d3c2ae1c
GW
6717 * a new anonymous hdr. Also find the last buffer
6718 * in the hdr's buffer list.
34dc7c2f 6719 */
a7004725 6720 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
d3c2ae1c 6721 ASSERT3P(lastbuf, !=, NULL);
34dc7c2f 6722
d3c2ae1c
GW
6723 /*
6724 * If the current arc_buf_t and the hdr are sharing their data
524b4217 6725 * buffer, then we must stop sharing that block.
d3c2ae1c
GW
6726 */
6727 if (arc_buf_is_shared(buf)) {
6728 ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
d3c2ae1c
GW
6729 VERIFY(!arc_buf_is_shared(lastbuf));
6730
6731 /*
6732 * First, sever the block sharing relationship between
a7004725 6733 * buf and the arc_buf_hdr_t.
d3c2ae1c
GW
6734 */
6735 arc_unshare_buf(hdr, buf);
2aa34383
DK
6736
6737 /*
a6255b7f 6738 * Now we need to recreate the hdr's b_pabd. Since we
524b4217 6739 * have lastbuf handy, we try to share with it, but if
a6255b7f 6740 * we can't then we allocate a new b_pabd and copy the
524b4217 6741 * data from buf into it.
2aa34383 6742 */
524b4217
DK
6743 if (arc_can_share(hdr, lastbuf)) {
6744 arc_share_buf(hdr, lastbuf);
6745 } else {
b5256303 6746 arc_hdr_alloc_abd(hdr, B_FALSE);
a6255b7f
DQ
6747 abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
6748 buf->b_data, psize);
2aa34383 6749 }
d3c2ae1c
GW
6750 VERIFY3P(lastbuf->b_data, !=, NULL);
6751 } else if (HDR_SHARED_DATA(hdr)) {
2aa34383
DK
6752 /*
6753 * Uncompressed shared buffers are always at the end
6754 * of the list. Compressed buffers don't have the
6755 * same requirements. This makes it hard to
6756 * simply assert that the lastbuf is shared so
6757 * we rely on the hdr's compression flags to determine
6758 * if we have a compressed, shared buffer.
6759 */
6760 ASSERT(arc_buf_is_shared(lastbuf) ||
b5256303 6761 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
2aa34383 6762 ASSERT(!ARC_BUF_SHARED(buf));
d3c2ae1c 6763 }
b5256303
TC
6764
6765 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
b9541d6b 6766 ASSERT3P(state, !=, arc_l2c_only);
36da08ef 6767
d3c2ae1c 6768 (void) refcount_remove_many(&state->arcs_size,
2aa34383 6769 arc_buf_size(buf), buf);
36da08ef 6770
b9541d6b 6771 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
b9541d6b 6772 ASSERT3P(state, !=, arc_l2c_only);
d3c2ae1c 6773 (void) refcount_remove_many(&state->arcs_esize[type],
2aa34383 6774 arc_buf_size(buf), buf);
34dc7c2f 6775 }
1eb5bfa3 6776
d3c2ae1c 6777 hdr->b_l1hdr.b_bufcnt -= 1;
b5256303
TC
6778 if (ARC_BUF_ENCRYPTED(buf))
6779 hdr->b_crypt_hdr.b_ebufcnt -= 1;
6780
34dc7c2f 6781 arc_cksum_verify(buf);
498877ba 6782 arc_buf_unwatch(buf);
34dc7c2f 6783
f486f584
TC
6784 /* if this is the last uncompressed buf free the checksum */
6785 if (!arc_hdr_has_uncompressed_buf(hdr))
6786 arc_cksum_free(hdr);
6787
34dc7c2f
BB
6788 mutex_exit(hash_lock);
6789
d3c2ae1c 6790 /*
a6255b7f 6791 * Allocate a new hdr. The new hdr will contain a b_pabd
d3c2ae1c
GW
6792 * buffer which will be freed in arc_write().
6793 */
b5256303
TC
6794 nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
6795 compress, type, HDR_HAS_RABD(hdr));
d3c2ae1c
GW
6796 ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
6797 ASSERT0(nhdr->b_l1hdr.b_bufcnt);
6798 ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
6799 VERIFY3U(nhdr->b_type, ==, type);
6800 ASSERT(!HDR_SHARED_DATA(nhdr));
b9541d6b 6801
d3c2ae1c
GW
6802 nhdr->b_l1hdr.b_buf = buf;
6803 nhdr->b_l1hdr.b_bufcnt = 1;
b5256303
TC
6804 if (ARC_BUF_ENCRYPTED(buf))
6805 nhdr->b_crypt_hdr.b_ebufcnt = 1;
b9541d6b
CW
6806 nhdr->b_l1hdr.b_mru_hits = 0;
6807 nhdr->b_l1hdr.b_mru_ghost_hits = 0;
6808 nhdr->b_l1hdr.b_mfu_hits = 0;
6809 nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
6810 nhdr->b_l1hdr.b_l2_hits = 0;
b9541d6b 6811 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
34dc7c2f 6812 buf->b_hdr = nhdr;
d3c2ae1c 6813
428870ff 6814 mutex_exit(&buf->b_evict_lock);
d3c2ae1c
GW
6815 (void) refcount_add_many(&arc_anon->arcs_size,
6816 HDR_GET_LSIZE(nhdr), buf);
34dc7c2f 6817 } else {
428870ff 6818 mutex_exit(&buf->b_evict_lock);
b9541d6b 6819 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
ca0bf58d
PS
6820 /* protected by hash lock, or hdr is on arc_anon */
6821 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
34dc7c2f 6822 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
b9541d6b
CW
6823 hdr->b_l1hdr.b_mru_hits = 0;
6824 hdr->b_l1hdr.b_mru_ghost_hits = 0;
6825 hdr->b_l1hdr.b_mfu_hits = 0;
6826 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
6827 hdr->b_l1hdr.b_l2_hits = 0;
6828 arc_change_state(arc_anon, hdr, hash_lock);
6829 hdr->b_l1hdr.b_arc_access = 0;
34dc7c2f 6830
b5256303 6831 mutex_exit(hash_lock);
428870ff 6832 buf_discard_identity(hdr);
34dc7c2f
BB
6833 arc_buf_thaw(buf);
6834 }
34dc7c2f
BB
6835}
6836
6837int
6838arc_released(arc_buf_t *buf)
6839{
b128c09f
BB
6840 int released;
6841
428870ff 6842 mutex_enter(&buf->b_evict_lock);
b9541d6b
CW
6843 released = (buf->b_data != NULL &&
6844 buf->b_hdr->b_l1hdr.b_state == arc_anon);
428870ff 6845 mutex_exit(&buf->b_evict_lock);
b128c09f 6846 return (released);
34dc7c2f
BB
6847}
6848
34dc7c2f
BB
6849#ifdef ZFS_DEBUG
6850int
6851arc_referenced(arc_buf_t *buf)
6852{
b128c09f
BB
6853 int referenced;
6854
428870ff 6855 mutex_enter(&buf->b_evict_lock);
b9541d6b 6856 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
428870ff 6857 mutex_exit(&buf->b_evict_lock);
b128c09f 6858 return (referenced);
34dc7c2f
BB
6859}
6860#endif
6861
6862static void
6863arc_write_ready(zio_t *zio)
6864{
6865 arc_write_callback_t *callback = zio->io_private;
6866 arc_buf_t *buf = callback->awcb_buf;
6867 arc_buf_hdr_t *hdr = buf->b_hdr;
b5256303
TC
6868 blkptr_t *bp = zio->io_bp;
6869 uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
a6255b7f 6870 fstrans_cookie_t cookie = spl_fstrans_mark();
34dc7c2f 6871
b9541d6b
CW
6872 ASSERT(HDR_HAS_L1HDR(hdr));
6873 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
d3c2ae1c 6874 ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
b128c09f 6875
34dc7c2f 6876 /*
d3c2ae1c
GW
6877 * If we're reexecuting this zio because the pool suspended, then
6878 * clean up any state that was previously set the first time the
2aa34383 6879 * callback was invoked.
34dc7c2f 6880 */
d3c2ae1c
GW
6881 if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
6882 arc_cksum_free(hdr);
6883 arc_buf_unwatch(buf);
a6255b7f 6884 if (hdr->b_l1hdr.b_pabd != NULL) {
d3c2ae1c 6885 if (arc_buf_is_shared(buf)) {
d3c2ae1c
GW
6886 arc_unshare_buf(hdr, buf);
6887 } else {
b5256303 6888 arc_hdr_free_abd(hdr, B_FALSE);
d3c2ae1c 6889 }
34dc7c2f 6890 }
b5256303
TC
6891
6892 if (HDR_HAS_RABD(hdr))
6893 arc_hdr_free_abd(hdr, B_TRUE);
34dc7c2f 6894 }
a6255b7f 6895 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
b5256303 6896 ASSERT(!HDR_HAS_RABD(hdr));
d3c2ae1c
GW
6897 ASSERT(!HDR_SHARED_DATA(hdr));
6898 ASSERT(!arc_buf_is_shared(buf));
6899
6900 callback->awcb_ready(zio, buf, callback->awcb_private);
6901
6902 if (HDR_IO_IN_PROGRESS(hdr))
6903 ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
6904
d3c2ae1c
GW
6905 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6906
b5256303
TC
6907 if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
6908 hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
6909
6910 if (BP_IS_PROTECTED(bp)) {
6911 /* ZIL blocks are written through zio_rewrite */
6912 ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
6913 ASSERT(HDR_PROTECTED(hdr));
6914
ae76f45c
TC
6915 if (BP_SHOULD_BYTESWAP(bp)) {
6916 if (BP_GET_LEVEL(bp) > 0) {
6917 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
6918 } else {
6919 hdr->b_l1hdr.b_byteswap =
6920 DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
6921 }
6922 } else {
6923 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
6924 }
6925
b5256303
TC
6926 hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
6927 hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
6928 zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
6929 hdr->b_crypt_hdr.b_iv);
6930 zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
6931 }
6932
6933 /*
6934 * If this block was written for raw encryption but the zio layer
6935 * ended up only authenticating it, adjust the buffer flags now.
6936 */
6937 if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
6938 arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
6939 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
6940 if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
6941 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
b1d21733
TC
6942 } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
6943 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
6944 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
b5256303
TC
6945 }
6946
6947 /* this must be done after the buffer flags are adjusted */
6948 arc_cksum_compute(buf);
6949
1c27024e 6950 enum zio_compress compress;
b5256303 6951 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
d3c2ae1c
GW
6952 compress = ZIO_COMPRESS_OFF;
6953 } else {
b5256303
TC
6954 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
6955 compress = BP_GET_COMPRESS(bp);
d3c2ae1c
GW
6956 }
6957 HDR_SET_PSIZE(hdr, psize);
6958 arc_hdr_set_compress(hdr, compress);
6959
4807c0ba
TC
6960 if (zio->io_error != 0 || psize == 0)
6961 goto out;
6962
d3c2ae1c 6963 /*
b5256303
TC
6964 * Fill the hdr with data. If the buffer is encrypted we have no choice
6965 * but to copy the data into b_rabd. If the hdr is compressed, the data
6966 * we want is available from the zio, otherwise we can take it from
6967 * the buf.
a6255b7f
DQ
6968 *
6969 * We might be able to share the buf's data with the hdr here. However,
6970 * doing so would cause the ARC to be full of linear ABDs if we write a
6971 * lot of shareable data. As a compromise, we check whether scattered
6972 * ABDs are allowed, and assume that if they are then the user wants
6973 * the ARC to be primarily filled with them regardless of the data being
6974 * written. Therefore, if they're allowed then we allocate one and copy
6975 * the data into it; otherwise, we share the data directly if we can.
d3c2ae1c 6976 */
b5256303 6977 if (ARC_BUF_ENCRYPTED(buf)) {
4807c0ba 6978 ASSERT3U(psize, >, 0);
b5256303
TC
6979 ASSERT(ARC_BUF_COMPRESSED(buf));
6980 arc_hdr_alloc_abd(hdr, B_TRUE);
6981 abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
6982 } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
a6255b7f
DQ
6983 /*
6984 * Ideally, we would always copy the io_abd into b_pabd, but the
6985 * user may have disabled compressed ARC, thus we must check the
6986 * hdr's compression setting rather than the io_bp's.
6987 */
b5256303 6988 if (BP_IS_ENCRYPTED(bp)) {
a6255b7f 6989 ASSERT3U(psize, >, 0);
b5256303
TC
6990 arc_hdr_alloc_abd(hdr, B_TRUE);
6991 abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
6992 } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
6993 !ARC_BUF_COMPRESSED(buf)) {
6994 ASSERT3U(psize, >, 0);
6995 arc_hdr_alloc_abd(hdr, B_FALSE);
a6255b7f
DQ
6996 abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
6997 } else {
6998 ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
b5256303 6999 arc_hdr_alloc_abd(hdr, B_FALSE);
a6255b7f
DQ
7000 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
7001 arc_buf_size(buf));
7002 }
d3c2ae1c 7003 } else {
a6255b7f 7004 ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
2aa34383 7005 ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
d3c2ae1c 7006 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
d3c2ae1c 7007
d3c2ae1c 7008 arc_share_buf(hdr, buf);
d3c2ae1c 7009 }
a6255b7f 7010
4807c0ba 7011out:
b5256303 7012 arc_hdr_verify(hdr, bp);
a6255b7f 7013 spl_fstrans_unmark(cookie);
34dc7c2f
BB
7014}
7015
bc77ba73
PD
7016static void
7017arc_write_children_ready(zio_t *zio)
7018{
7019 arc_write_callback_t *callback = zio->io_private;
7020 arc_buf_t *buf = callback->awcb_buf;
7021
7022 callback->awcb_children_ready(zio, buf, callback->awcb_private);
7023}
7024
e8b96c60
MA
7025/*
7026 * The SPA calls this callback for each physical write that happens on behalf
7027 * of a logical write. See the comment in dbuf_write_physdone() for details.
7028 */
7029static void
7030arc_write_physdone(zio_t *zio)
7031{
7032 arc_write_callback_t *cb = zio->io_private;
7033 if (cb->awcb_physdone != NULL)
7034 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
7035}
7036
34dc7c2f
BB
7037static void
7038arc_write_done(zio_t *zio)
7039{
7040 arc_write_callback_t *callback = zio->io_private;
7041 arc_buf_t *buf = callback->awcb_buf;
7042 arc_buf_hdr_t *hdr = buf->b_hdr;
7043
d3c2ae1c 7044 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
428870ff
BB
7045
7046 if (zio->io_error == 0) {
d3c2ae1c
GW
7047 arc_hdr_verify(hdr, zio->io_bp);
7048
9b67f605 7049 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
b0bc7a84
MG
7050 buf_discard_identity(hdr);
7051 } else {
7052 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
7053 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
b0bc7a84 7054 }
428870ff 7055 } else {
d3c2ae1c 7056 ASSERT(HDR_EMPTY(hdr));
428870ff 7057 }
34dc7c2f 7058
34dc7c2f 7059 /*
9b67f605
MA
7060 * If the block to be written was all-zero or compressed enough to be
7061 * embedded in the BP, no write was performed so there will be no
7062 * dva/birth/checksum. The buffer must therefore remain anonymous
7063 * (and uncached).
34dc7c2f 7064 */
d3c2ae1c 7065 if (!HDR_EMPTY(hdr)) {
34dc7c2f
BB
7066 arc_buf_hdr_t *exists;
7067 kmutex_t *hash_lock;
7068
524b4217 7069 ASSERT3U(zio->io_error, ==, 0);
428870ff 7070
34dc7c2f
BB
7071 arc_cksum_verify(buf);
7072
7073 exists = buf_hash_insert(hdr, &hash_lock);
b9541d6b 7074 if (exists != NULL) {
34dc7c2f
BB
7075 /*
7076 * This can only happen if we overwrite for
7077 * sync-to-convergence, because we remove
7078 * buffers from the hash table when we arc_free().
7079 */
428870ff
BB
7080 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
7081 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
7082 panic("bad overwrite, hdr=%p exists=%p",
7083 (void *)hdr, (void *)exists);
b9541d6b
CW
7084 ASSERT(refcount_is_zero(
7085 &exists->b_l1hdr.b_refcnt));
428870ff
BB
7086 arc_change_state(arc_anon, exists, hash_lock);
7087 mutex_exit(hash_lock);
7088 arc_hdr_destroy(exists);
7089 exists = buf_hash_insert(hdr, &hash_lock);
7090 ASSERT3P(exists, ==, NULL);
03c6040b
GW
7091 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
7092 /* nopwrite */
7093 ASSERT(zio->io_prop.zp_nopwrite);
7094 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
7095 panic("bad nopwrite, hdr=%p exists=%p",
7096 (void *)hdr, (void *)exists);
428870ff
BB
7097 } else {
7098 /* Dedup */
d3c2ae1c 7099 ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
b9541d6b 7100 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
428870ff
BB
7101 ASSERT(BP_GET_DEDUP(zio->io_bp));
7102 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
7103 }
34dc7c2f 7104 }
d3c2ae1c 7105 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
b128c09f 7106 /* if it's not anon, we are doing a scrub */
b9541d6b 7107 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
b128c09f 7108 arc_access(hdr, hash_lock);
34dc7c2f 7109 mutex_exit(hash_lock);
34dc7c2f 7110 } else {
d3c2ae1c 7111 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
34dc7c2f
BB
7112 }
7113
b9541d6b 7114 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
428870ff 7115 callback->awcb_done(zio, buf, callback->awcb_private);
34dc7c2f 7116
a6255b7f 7117 abd_put(zio->io_abd);
34dc7c2f
BB
7118 kmem_free(callback, sizeof (arc_write_callback_t));
7119}
7120
7121zio_t *
428870ff 7122arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
d3c2ae1c 7123 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
b5256303
TC
7124 const zio_prop_t *zp, arc_write_done_func_t *ready,
7125 arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
7126 arc_write_done_func_t *done, void *private, zio_priority_t priority,
5dbd68a3 7127 int zio_flags, const zbookmark_phys_t *zb)
34dc7c2f
BB
7128{
7129 arc_buf_hdr_t *hdr = buf->b_hdr;
7130 arc_write_callback_t *callback;
b128c09f 7131 zio_t *zio;
82644107 7132 zio_prop_t localprop = *zp;
34dc7c2f 7133
d3c2ae1c
GW
7134 ASSERT3P(ready, !=, NULL);
7135 ASSERT3P(done, !=, NULL);
34dc7c2f 7136 ASSERT(!HDR_IO_ERROR(hdr));
b9541d6b 7137 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
d3c2ae1c
GW
7138 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
7139 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
b128c09f 7140 if (l2arc)
d3c2ae1c 7141 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
82644107 7142
b5256303
TC
7143 if (ARC_BUF_ENCRYPTED(buf)) {
7144 ASSERT(ARC_BUF_COMPRESSED(buf));
7145 localprop.zp_encrypt = B_TRUE;
7146 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
7147 localprop.zp_byteorder =
7148 (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
7149 ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
7150 bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
7151 ZIO_DATA_SALT_LEN);
7152 bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
7153 ZIO_DATA_IV_LEN);
7154 bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
7155 ZIO_DATA_MAC_LEN);
7156 if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
7157 localprop.zp_nopwrite = B_FALSE;
7158 localprop.zp_copies =
7159 MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
7160 }
2aa34383 7161 zio_flags |= ZIO_FLAG_RAW;
b5256303
TC
7162 } else if (ARC_BUF_COMPRESSED(buf)) {
7163 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
7164 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
7165 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
2aa34383 7166 }
79c76d5b 7167 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
34dc7c2f 7168 callback->awcb_ready = ready;
bc77ba73 7169 callback->awcb_children_ready = children_ready;
e8b96c60 7170 callback->awcb_physdone = physdone;
34dc7c2f
BB
7171 callback->awcb_done = done;
7172 callback->awcb_private = private;
7173 callback->awcb_buf = buf;
b128c09f 7174
d3c2ae1c 7175 /*
a6255b7f 7176 * The hdr's b_pabd is now stale; free it now. A new data block
d3c2ae1c
GW
7177 * will be allocated when the zio pipeline calls arc_write_ready().
7178 */
a6255b7f 7179 if (hdr->b_l1hdr.b_pabd != NULL) {
d3c2ae1c
GW
7180 /*
7181 * If the buf is currently sharing the data block with
7182 * the hdr then we need to break that relationship here.
7183 * The hdr will remain with a NULL data pointer and the
7184 * buf will take sole ownership of the block.
7185 */
7186 if (arc_buf_is_shared(buf)) {
d3c2ae1c
GW
7187 arc_unshare_buf(hdr, buf);
7188 } else {
b5256303 7189 arc_hdr_free_abd(hdr, B_FALSE);
d3c2ae1c
GW
7190 }
7191 VERIFY3P(buf->b_data, !=, NULL);
d3c2ae1c 7192 }
b5256303
TC
7193
7194 if (HDR_HAS_RABD(hdr))
7195 arc_hdr_free_abd(hdr, B_TRUE);
7196
71a24c3c
TC
7197 if (!(zio_flags & ZIO_FLAG_RAW))
7198 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
b5256303 7199
d3c2ae1c 7200 ASSERT(!arc_buf_is_shared(buf));
a6255b7f 7201 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
d3c2ae1c 7202
a6255b7f
DQ
7203 zio = zio_write(pio, spa, txg, bp,
7204 abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
82644107 7205 HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
bc77ba73
PD
7206 (children_ready != NULL) ? arc_write_children_ready : NULL,
7207 arc_write_physdone, arc_write_done, callback,
e8b96c60 7208 priority, zio_flags, zb);
34dc7c2f
BB
7209
7210 return (zio);
7211}
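/*
 * Illustrative sketch, not taken from this file: handing an anonymous,
 * released buffer to arc_write() and waiting on the resulting zio.  The
 * zio_prop_t setup, the callback names and the surrounding syncing
 * context are assumptions for illustration; the arc_write() parameter
 * order and the (zio, buf, private) callback shape follow this file.
 *
 *	static void
 *	my_write_ready(zio_t *zio, arc_buf_t *buf, void *private)
 *	{
 *		(buf->b_data must be stable from this point on)
 *	}
 *
 *	static void
 *	my_write_done(zio_t *zio, arc_buf_t *buf, void *private)
 *	{
 *		(zio->io_error says whether the write made it to disk)
 *	}
 *
 *	zio_t *wzio = arc_write(pio, spa, txg, bp, buf, B_TRUE, &zp,
 *	    my_write_ready, NULL, NULL, my_write_done, private,
 *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, zb);
 *	(void) zio_wait(wzio);
 */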
7212
34dc7c2f 7213static int
dae3e9ea 7214arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
34dc7c2f
BB
7215{
7216#ifdef _KERNEL
70f02287 7217 uint64_t available_memory = arc_free_memory();
0c5493d4 7218
70f02287 7219#if defined(_ILP32)
9edb3695
BB
7220 available_memory =
7221 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
7222#endif
7223
7224 if (available_memory > arc_all_memory() * arc_lotsfree_percent / 100)
ca67b33a
MA
7225 return (0);
7226
dae3e9ea
DB
7227 if (txg > spa->spa_lowmem_last_txg) {
7228 spa->spa_lowmem_last_txg = txg;
7229 spa->spa_lowmem_page_load = 0;
7e8bddd0 7230 }
7e8bddd0
BB
7231 /*
7232 * If we are in pageout, we know that memory is already tight and
7233 * the arc is already going to be evicting, so we just want to
7234 * continue to let page writes occur as quickly as possible.
7235 */
7236 if (current_is_kswapd()) {
dae3e9ea
DB
7237 if (spa->spa_lowmem_page_load >
7238 MAX(arc_sys_free / 4, available_memory) / 4) {
7e8bddd0
BB
7239 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
7240 return (SET_ERROR(ERESTART));
7241 }
7242 /* Note: reserve is inflated, so we deflate */
dae3e9ea 7243 atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
7e8bddd0 7244 return (0);
dae3e9ea 7245 } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
ca67b33a 7246 /* memory is low, delay before restarting */
34dc7c2f 7247 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
570827e1 7248 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
2e528b49 7249 return (SET_ERROR(EAGAIN));
34dc7c2f 7250 }
dae3e9ea
DB
7251 spa->spa_lowmem_page_load = 0;
7252#endif /* _KERNEL */
34dc7c2f
BB
7253 return (0);
7254}
7255
7256void
7257arc_tempreserve_clear(uint64_t reserve)
7258{
7259 atomic_add_64(&arc_tempreserve, -reserve);
7260 ASSERT((int64_t)arc_tempreserve >= 0);
7261}
7262
7263int
dae3e9ea 7264arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
34dc7c2f
BB
7265{
7266 int error;
9babb374 7267 uint64_t anon_size;
34dc7c2f 7268
1b8951b3
TC
7269 if (!arc_no_grow &&
7270 reserve > arc_c/4 &&
7271 reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
34dc7c2f 7272 arc_c = MIN(arc_c_max, reserve * 4);
12f9a6a3
BB
7273
7274 /*
7275 * Throttle when the calculated memory footprint for the TXG
7276 * exceeds the target ARC size.
7277 */
570827e1
BB
7278 if (reserve > arc_c) {
7279 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
12f9a6a3 7280 return (SET_ERROR(ERESTART));
570827e1 7281 }
34dc7c2f 7282
9babb374
BB
7283 /*
7284 * Don't count loaned bufs as in flight dirty data to prevent long
7285 * network delays from blocking transactions that are ready to be
7286 * assigned to a txg.
7287 */
a7004725
DK
7288
7289 /* assert that it has not wrapped around */
7290 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
7291
36da08ef
PS
7292 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
7293 arc_loaned_bytes), 0);
9babb374 7294
34dc7c2f
BB
7295 /*
7296 * Writes will, almost always, require additional memory allocations
d3cc8b15 7297 * in order to compress/encrypt/etc the data. We therefore need to
34dc7c2f
BB
7298 * make sure that there is sufficient available memory for this.
7299 */
dae3e9ea 7300 error = arc_memory_throttle(spa, reserve, txg);
e8b96c60 7301 if (error != 0)
34dc7c2f
BB
7302 return (error);
7303
7304 /*
7305 * Throttle writes when the amount of dirty data in the cache
7306 * gets too large. We try to keep the cache less than half full
7307 * of dirty blocks so that our sync times don't grow too large.
dae3e9ea
DB
7308 *
7309 * In the case of one pool being built on another pool, we want
7310 * to make sure we don't end up throttling the lower (backing)
7311 * pool when the upper pool is the majority contributor to dirty
7312 * data. To ensure we make forward progress during throttling, we
7313 * also check the current pool's net dirty data and only throttle
7314 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
7315 * data in the cache.
7316 *
34dc7c2f
BB
7317 * Note: if two requests come in concurrently, we might let them
7318 * both succeed, when one of them should fail. Not a huge deal.
7319 */
dae3e9ea
DB
7320 uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
7321 uint64_t spa_dirty_anon = spa_dirty_data(spa);
9babb374 7322
dae3e9ea
DB
7323 if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
7324 anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
7325 spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
2fd92c3d 7326#ifdef ZFS_DEBUG
d3c2ae1c
GW
7327 uint64_t meta_esize =
7328 refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7329 uint64_t data_esize =
7330 refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
34dc7c2f
BB
7331 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
7332 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
d3c2ae1c
GW
7333 arc_tempreserve >> 10, meta_esize >> 10,
7334 data_esize >> 10, reserve >> 10, arc_c >> 10);
2fd92c3d 7335#endif
570827e1 7336 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
2e528b49 7337 return (SET_ERROR(ERESTART));
34dc7c2f
BB
7338 }
7339 atomic_add_64(&arc_tempreserve, reserve);
7340 return (0);
7341}
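/*
 * Illustrative sketch, not taken from this file: the reserve/clear pairing
 * arc_tempreserve_space() and arc_tempreserve_clear() above are built for.
 * The retry handling and the surrounding transaction-assignment context
 * are assumptions for illustration.
 *
 *	int err = arc_tempreserve_space(spa, reserve, txg);
 *	if (err == ERESTART || err == EAGAIN) {
 *		(memory or dirty-data throttle hit: back off and retry
 *		 the transaction assignment later)
 *	} else if (err == 0) {
 *		(dirty the reserved data)
 *		arc_tempreserve_clear(reserve);
 *	}
 */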
7342
13be560d
BB
7343static void
7344arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
7345 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
7346{
36da08ef 7347 size->value.ui64 = refcount_count(&state->arcs_size);
d3c2ae1c
GW
7348 evict_data->value.ui64 =
7349 refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
7350 evict_metadata->value.ui64 =
7351 refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
13be560d
BB
7352}
7353
7354static int
7355arc_kstat_update(kstat_t *ksp, int rw)
7356{
7357 arc_stats_t *as = ksp->ks_data;
7358
7359 if (rw == KSTAT_WRITE) {
ecb2b7dc 7360 return (SET_ERROR(EACCES));
13be560d
BB
7361 } else {
7362 arc_kstat_update_state(arc_anon,
7363 &as->arcstat_anon_size,
500445c0
PS
7364 &as->arcstat_anon_evictable_data,
7365 &as->arcstat_anon_evictable_metadata);
13be560d
BB
7366 arc_kstat_update_state(arc_mru,
7367 &as->arcstat_mru_size,
500445c0
PS
7368 &as->arcstat_mru_evictable_data,
7369 &as->arcstat_mru_evictable_metadata);
13be560d
BB
7370 arc_kstat_update_state(arc_mru_ghost,
7371 &as->arcstat_mru_ghost_size,
500445c0
PS
7372 &as->arcstat_mru_ghost_evictable_data,
7373 &as->arcstat_mru_ghost_evictable_metadata);
13be560d
BB
7374 arc_kstat_update_state(arc_mfu,
7375 &as->arcstat_mfu_size,
500445c0
PS
7376 &as->arcstat_mfu_evictable_data,
7377 &as->arcstat_mfu_evictable_metadata);
fc41c640 7378 arc_kstat_update_state(arc_mfu_ghost,
13be560d 7379 &as->arcstat_mfu_ghost_size,
500445c0
PS
7380 &as->arcstat_mfu_ghost_evictable_data,
7381 &as->arcstat_mfu_ghost_evictable_metadata);
70f02287 7382
37fb3e43
PD
7383 ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
7384 ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
7385 ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
7386 ARCSTAT(arcstat_metadata_size) =
7387 aggsum_value(&astat_metadata_size);
7388 ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
7389 ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
7390 ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
7391 ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
7392 ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
7393
70f02287
BB
7394 as->arcstat_memory_all_bytes.value.ui64 =
7395 arc_all_memory();
7396 as->arcstat_memory_free_bytes.value.ui64 =
7397 arc_free_memory();
7398 as->arcstat_memory_available_bytes.value.i64 =
7399 arc_available_memory();
13be560d
BB
7400 }
7401
7402 return (0);
7403}
7404
ca0bf58d
PS
7405/*
7406 * This function *must* return indices evenly distributed between all
7407 * sublists of the multilist. This is needed due to how the ARC eviction
7408 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
7409 * distributed between all sublists and uses this assumption when
7410 * deciding which sublist to evict from and how much to evict from it.
7411 */
7412unsigned int
7413arc_state_multilist_index_func(multilist_t *ml, void *obj)
7414{
7415 arc_buf_hdr_t *hdr = obj;
7416
7417 /*
7418 * We rely on b_dva to generate evenly distributed index
7419 * numbers using buf_hash below. So, as an added precaution,
7420 * let's make sure we never add empty buffers to the arc lists.
7421 */
d3c2ae1c 7422 ASSERT(!HDR_EMPTY(hdr));
ca0bf58d
PS
7423
7424 /*
7425 * The assumption here, is the hash value for a given
7426 * arc_buf_hdr_t will remain constant throughout its lifetime
7427 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
7428 * Thus, we don't need to store the header's sublist index
7429 * on insertion, as this index can be recalculated on removal.
7430 *
7431 * Also, the low order bits of the hash value are thought to be
7432 * distributed evenly. Otherwise, in the case that the multilist
7433 * has a power-of-two number of sublists, each sublist's usage
7434 * would not be evenly distributed.
7435 */
7436 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
7437 multilist_get_num_sublists(ml));
7438}
7439
ca67b33a
MA
7440/*
7441 * Called during module initialization and periodically thereafter to
7442 * apply reasonable changes to the exposed performance tunings. Non-zero
7443 * zfs_* values which differ from the currently set values will be applied.
7444 */
7445static void
7446arc_tuning_update(void)
7447{
b8a97fb1 7448 uint64_t allmem = arc_all_memory();
7449 unsigned long limit;
9edb3695 7450
ca67b33a
MA
7451 /* Valid range: 64M - <all physical memory> */
7452 if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
7403d074 7453 (zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) &&
ca67b33a
MA
7454 (zfs_arc_max > arc_c_min)) {
7455 arc_c_max = zfs_arc_max;
7456 arc_c = arc_c_max;
7457 arc_p = (arc_c >> 1);
b8a97fb1 7458 if (arc_meta_limit > arc_c_max)
7459 arc_meta_limit = arc_c_max;
7460 if (arc_dnode_limit > arc_meta_limit)
7461 arc_dnode_limit = arc_meta_limit;
ca67b33a
MA
7462 }
7463
7464 /* Valid range: 32M - <arc_c_max> */
7465 if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
7466 (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
7467 (zfs_arc_min <= arc_c_max)) {
7468 arc_c_min = zfs_arc_min;
7469 arc_c = MAX(arc_c, arc_c_min);
7470 }
7471
7472 /* Valid range: 16M - <arc_c_max> */
7473 if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
7474 (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
7475 (zfs_arc_meta_min <= arc_c_max)) {
7476 arc_meta_min = zfs_arc_meta_min;
b8a97fb1 7477 if (arc_meta_limit < arc_meta_min)
7478 arc_meta_limit = arc_meta_min;
7479 if (arc_dnode_limit < arc_meta_min)
7480 arc_dnode_limit = arc_meta_min;
ca67b33a
MA
7481 }
7482
7483 /* Valid range: <arc_meta_min> - <arc_c_max> */
b8a97fb1 7484 limit = zfs_arc_meta_limit ? zfs_arc_meta_limit :
7485 MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100;
7486 if ((limit != arc_meta_limit) &&
7487 (limit >= arc_meta_min) &&
7488 (limit <= arc_c_max))
7489 arc_meta_limit = limit;
7490
7491 /* Valid range: <arc_meta_min> - <arc_meta_limit> */
7492 limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
7493 MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100;
7494 if ((limit != arc_dnode_limit) &&
7495 (limit >= arc_meta_min) &&
7496 (limit <= arc_meta_limit))
7497 arc_dnode_limit = limit;
25458cbe 7498
ca67b33a
MA
7499 /* Valid range: 1 - N */
7500 if (zfs_arc_grow_retry)
7501 arc_grow_retry = zfs_arc_grow_retry;
7502
7503 /* Valid range: 1 - N */
7504 if (zfs_arc_shrink_shift) {
7505 arc_shrink_shift = zfs_arc_shrink_shift;
7506 arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
7507 }
7508
728d6ae9
BB
7509 /* Valid range: 1 - N */
7510 if (zfs_arc_p_min_shift)
7511 arc_p_min_shift = zfs_arc_p_min_shift;
7512
d4a72f23
TC
7513 /* Valid range: 1 - N ms */
7514 if (zfs_arc_min_prefetch_ms)
7515 arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
7516
7517 /* Valid range: 1 - N ms */
7518 if (zfs_arc_min_prescient_prefetch_ms) {
7519 arc_min_prescient_prefetch_ms =
7520 zfs_arc_min_prescient_prefetch_ms;
7521 }
11f552fa 7522
7e8bddd0
BB
7523 /* Valid range: 0 - 100 */
7524 if ((zfs_arc_lotsfree_percent >= 0) &&
7525 (zfs_arc_lotsfree_percent <= 100))
7526 arc_lotsfree_percent = zfs_arc_lotsfree_percent;
7527
11f552fa
BB
7528 /* Valid range: 0 - <all physical memory> */
7529 if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
9edb3695 7530 arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem);
7e8bddd0 7531
ca67b33a
MA
7532}
7533
d3c2ae1c
GW
7534static void
7535arc_state_init(void)
7536{
7537 arc_anon = &ARC_anon;
7538 arc_mru = &ARC_mru;
7539 arc_mru_ghost = &ARC_mru_ghost;
7540 arc_mfu = &ARC_mfu;
7541 arc_mfu_ghost = &ARC_mfu_ghost;
7542 arc_l2c_only = &ARC_l2c_only;
7543
64fc7762
MA
7544 arc_mru->arcs_list[ARC_BUFC_METADATA] =
7545 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7546 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7547 arc_state_multilist_index_func);
64fc7762
MA
7548 arc_mru->arcs_list[ARC_BUFC_DATA] =
7549 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7550 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7551 arc_state_multilist_index_func);
64fc7762
MA
7552 arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
7553 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7554 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7555 arc_state_multilist_index_func);
64fc7762
MA
7556 arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
7557 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7558 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7559 arc_state_multilist_index_func);
64fc7762
MA
7560 arc_mfu->arcs_list[ARC_BUFC_METADATA] =
7561 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7562 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7563 arc_state_multilist_index_func);
64fc7762
MA
7564 arc_mfu->arcs_list[ARC_BUFC_DATA] =
7565 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7566 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7567 arc_state_multilist_index_func);
64fc7762
MA
7568 arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
7569 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7570 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7571 arc_state_multilist_index_func);
64fc7762
MA
7572 arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
7573 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7574 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7575 arc_state_multilist_index_func);
64fc7762
MA
7576 arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
7577 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7578 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7579 arc_state_multilist_index_func);
64fc7762
MA
7580 arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
7581 multilist_create(sizeof (arc_buf_hdr_t),
d3c2ae1c 7582 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
c30e58c4 7583 arc_state_multilist_index_func);
d3c2ae1c
GW
7584
7585 refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7586 refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7587 refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7588 refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7589 refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7590 refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7591 refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7592 refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7593 refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7594 refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7595 refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7596 refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7597
7598 refcount_create(&arc_anon->arcs_size);
7599 refcount_create(&arc_mru->arcs_size);
7600 refcount_create(&arc_mru_ghost->arcs_size);
7601 refcount_create(&arc_mfu->arcs_size);
7602 refcount_create(&arc_mfu_ghost->arcs_size);
7603 refcount_create(&arc_l2c_only->arcs_size);
7604
37fb3e43
PD
7605 aggsum_init(&arc_meta_used, 0);
7606 aggsum_init(&arc_size, 0);
7607 aggsum_init(&astat_data_size, 0);
7608 aggsum_init(&astat_metadata_size, 0);
7609 aggsum_init(&astat_hdr_size, 0);
7610 aggsum_init(&astat_l2_hdr_size, 0);
7611 aggsum_init(&astat_bonus_size, 0);
7612 aggsum_init(&astat_dnode_size, 0);
7613 aggsum_init(&astat_dbuf_size, 0);
7614
d3c2ae1c
GW
7615 arc_anon->arcs_state = ARC_STATE_ANON;
7616 arc_mru->arcs_state = ARC_STATE_MRU;
7617 arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
7618 arc_mfu->arcs_state = ARC_STATE_MFU;
7619 arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
7620 arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
7621}
7622
7623static void
7624arc_state_fini(void)
7625{
7626 refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7627 refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7628 refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7629 refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7630 refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7631 refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7632 refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7633 refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7634 refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7635 refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7636 refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7637 refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7638
7639 refcount_destroy(&arc_anon->arcs_size);
7640 refcount_destroy(&arc_mru->arcs_size);
7641 refcount_destroy(&arc_mru_ghost->arcs_size);
7642 refcount_destroy(&arc_mfu->arcs_size);
7643 refcount_destroy(&arc_mfu_ghost->arcs_size);
7644 refcount_destroy(&arc_l2c_only->arcs_size);
7645
64fc7762
MA
7646 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
7647 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
7648 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
7649 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
7650 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
7651 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
7652 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
7653 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
7654 multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
7655 multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
37fb3e43
PD
7656
7657 aggsum_fini(&arc_meta_used);
7658 aggsum_fini(&arc_size);
7659 aggsum_fini(&astat_data_size);
7660 aggsum_fini(&astat_metadata_size);
7661 aggsum_fini(&astat_hdr_size);
7662 aggsum_fini(&astat_l2_hdr_size);
7663 aggsum_fini(&astat_bonus_size);
7664 aggsum_fini(&astat_dnode_size);
7665 aggsum_fini(&astat_dbuf_size);
d3c2ae1c
GW
7666}
7667
7668uint64_t
e71cade6 7669arc_target_bytes(void)
d3c2ae1c 7670{
e71cade6 7671 return (arc_c);
d3c2ae1c
GW
7672}
7673
34dc7c2f
BB
7674void
7675arc_init(void)
7676{
9edb3695 7677 uint64_t percent, allmem = arc_all_memory();
ca67b33a 7678
ca0bf58d
PS
7679 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
7680 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
7681 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
7682
2b84817f
TC
7683 arc_min_prefetch_ms = 1000;
7684 arc_min_prescient_prefetch_ms = 6000;
34dc7c2f 7685
34dc7c2f 7686#ifdef _KERNEL
7cb67b45
BB
7687 /*
7688 * Register a shrinker to support synchronous (direct) memory
7689 * reclaim from the arc. This is done to prevent kswapd from
7690 * swapping out pages when it is preferable to shrink the arc.
7691 */
7692 spl_register_shrinker(&arc_shrinker);
11f552fa
BB
7693
7694 /* Set to 1/64 of all memory or a minimum of 512K */
9edb3695 7695 arc_sys_free = MAX(allmem / 64, (512 * 1024));
11f552fa 7696 arc_need_free = 0;
34dc7c2f
BB
7697#endif
7698
0a1f8cd9
TC
7699 /* Set max to 1/2 of all memory */
7700 arc_c_max = allmem / 2;
7701
4ce3c45a
BB
7702#ifdef _KERNEL
7703 /* Set min cache to 1/32 of all memory, or 32MB, whichever is more */
7704 arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
7705#else
ab5cbbd1
BB
7706 /*
7707 * In userland, there's only the memory pressure that we artificially
7708 * create (see arc_available_memory()). Don't let arc_c get too
7709 * small, because it can cause transactions to be larger than
7710 * arc_c, causing arc_tempreserve_space() to fail.
7711 */
0a1f8cd9 7712 arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
ab5cbbd1
BB
7713#endif
7714
34dc7c2f
BB
7715 arc_c = arc_c_max;
7716 arc_p = (arc_c >> 1);
7717
ca67b33a
MA
7718	/* Set the default meta min to 16M (1ULL << SPA_MAXBLOCKSHIFT) */
7719 arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
7720 /* Initialize maximum observed usage to zero */
1834f2d8 7721 arc_meta_max = 0;
9907cc1c
G
7722 /*
7723 * Set arc_meta_limit to a percent of arc_c_max with a floor of
7724 * arc_meta_min, and a ceiling of arc_c_max.
7725 */
7726 percent = MIN(zfs_arc_meta_limit_percent, 100);
7727 arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
7728 percent = MIN(zfs_arc_dnode_limit_percent, 100);
7729 arc_dnode_limit = (percent * arc_meta_limit) / 100;
34dc7c2f 7730
ca67b33a
MA
7731 /* Apply user specified tunings */
7732 arc_tuning_update();
c52fca13 7733
34dc7c2f
BB
7734	/* if kmem_flags are set, let's try to use less memory */
7735 if (kmem_debugging())
7736 arc_c = arc_c / 2;
7737 if (arc_c < arc_c_min)
7738 arc_c = arc_c_min;
7739
d3c2ae1c 7740 arc_state_init();
34dc7c2f
BB
7741 buf_init();
7742
ab26409d
BB
7743 list_create(&arc_prune_list, sizeof (arc_prune_t),
7744 offsetof(arc_prune_t, p_node));
ab26409d 7745 mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f 7746
1229323d 7747 arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri,
aa9af22c 7748 max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
f6046738 7749
d3c2ae1c
GW
7750 arc_reclaim_thread_exit = B_FALSE;
7751
34dc7c2f
BB
7752 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
7753 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
7754
7755 if (arc_ksp != NULL) {
7756 arc_ksp->ks_data = &arc_stats;
13be560d 7757 arc_ksp->ks_update = arc_kstat_update;
34dc7c2f
BB
7758 kstat_install(arc_ksp);
7759 }
7760
ca67b33a 7761 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
1229323d 7762 TS_RUN, defclsyspri);
34dc7c2f 7763
d3c2ae1c 7764 arc_dead = B_FALSE;
b128c09f 7765 arc_warm = B_FALSE;
34dc7c2f 7766
e8b96c60
MA
7767 /*
7768 * Calculate maximum amount of dirty data per pool.
7769 *
7770 * If it has been set by a module parameter, take that.
7771 * Otherwise, use a percentage of physical memory defined by
7772 * zfs_dirty_data_max_percent (default 10%) with a cap at
e99932f7 7773 * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
e8b96c60
MA
7774 */
7775 if (zfs_dirty_data_max_max == 0)
e99932f7
BB
7776 zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
7777 allmem * zfs_dirty_data_max_max_percent / 100);
e8b96c60
MA
7778
7779 if (zfs_dirty_data_max == 0) {
9edb3695 7780 zfs_dirty_data_max = allmem *
e8b96c60
MA
7781 zfs_dirty_data_max_percent / 100;
7782 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
7783 zfs_dirty_data_max_max);
7784 }
34dc7c2f
BB
7785}
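
To make the dirty-data sizing at the end of arc_init() concrete, here is a minimal userland sketch. It assumes 32 GiB of physical memory and the usual 10% / 25% defaults for zfs_dirty_data_max_percent and zfs_dirty_data_max_max_percent; none of these values are read from a live system:

/* Userland sketch of the zfs_dirty_data_max derivation in arc_init(). */
#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t allmem = 32ULL << 30;		/* assume 32 GiB of RAM */
	uint64_t dirty_max_percent = 10;	/* assumed default */
	uint64_t dirty_max_max_percent = 25;	/* assumed default */

	/* Cap: the lesser of 4 GiB and 25% of physical memory. */
	uint64_t dirty_max_max = MIN(4ULL << 30,
	    allmem * dirty_max_max_percent / 100);

	/* 10% of physical memory, clamped by the cap above. */
	uint64_t dirty_max = MIN(allmem * dirty_max_percent / 100,
	    dirty_max_max);

	printf("zfs_dirty_data_max_max = %llu MiB\n",
	    (unsigned long long)(dirty_max_max >> 20));
	printf("zfs_dirty_data_max     = %llu MiB\n",
	    (unsigned long long)(dirty_max >> 20));
	return (0);
}
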
7786
7787void
7788arc_fini(void)
7789{
ab26409d
BB
7790 arc_prune_t *p;
7791
7cb67b45
BB
7792#ifdef _KERNEL
7793 spl_unregister_shrinker(&arc_shrinker);
7794#endif /* _KERNEL */
7795
ca0bf58d 7796 mutex_enter(&arc_reclaim_lock);
d3c2ae1c 7797 arc_reclaim_thread_exit = B_TRUE;
ca0bf58d
PS
7798 /*
7799 * The reclaim thread will set arc_reclaim_thread_exit back to
d3c2ae1c 7800 * B_FALSE when it is finished exiting; we're waiting for that.
ca0bf58d
PS
7801 */
7802 while (arc_reclaim_thread_exit) {
7803 cv_signal(&arc_reclaim_thread_cv);
7804 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
7805 }
7806 mutex_exit(&arc_reclaim_lock);
7807
d3c2ae1c
GW
7808 /* Use B_TRUE to ensure *all* buffers are evicted */
7809 arc_flush(NULL, B_TRUE);
34dc7c2f 7810
d3c2ae1c 7811 arc_dead = B_TRUE;
34dc7c2f
BB
7812
7813 if (arc_ksp != NULL) {
7814 kstat_delete(arc_ksp);
7815 arc_ksp = NULL;
7816 }
7817
f6046738
BB
7818 taskq_wait(arc_prune_taskq);
7819 taskq_destroy(arc_prune_taskq);
7820
ab26409d
BB
7821 mutex_enter(&arc_prune_mtx);
7822 while ((p = list_head(&arc_prune_list)) != NULL) {
7823 list_remove(&arc_prune_list, p);
7824 refcount_remove(&p->p_refcnt, &arc_prune_list);
7825 refcount_destroy(&p->p_refcnt);
7826 kmem_free(p, sizeof (*p));
7827 }
7828 mutex_exit(&arc_prune_mtx);
7829
7830 list_destroy(&arc_prune_list);
7831 mutex_destroy(&arc_prune_mtx);
ca0bf58d
PS
7832 mutex_destroy(&arc_reclaim_lock);
7833 cv_destroy(&arc_reclaim_thread_cv);
7834 cv_destroy(&arc_reclaim_waiters_cv);
7835
d3c2ae1c 7836 arc_state_fini();
34dc7c2f 7837 buf_fini();
9babb374 7838
b9541d6b 7839 ASSERT0(arc_loaned_bytes);
34dc7c2f
BB
7840}
7841
7842/*
7843 * Level 2 ARC
7844 *
7845 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
7846 * It uses dedicated storage devices to hold cached data, which are populated
7847 * using large infrequent writes. The main role of this cache is to boost
7848 * the performance of random read workloads. The intended L2ARC devices
7849 * include short-stroked disks, solid state disks, and other media with
7850 * substantially faster read latency than disk.
7851 *
7852 * +-----------------------+
7853 * | ARC |
7854 * +-----------------------+
7855 * | ^ ^
7856 * | | |
7857 * l2arc_feed_thread() arc_read()
7858 * | | |
7859 * | l2arc read |
7860 * V | |
7861 * +---------------+ |
7862 * | L2ARC | |
7863 * +---------------+ |
7864 * | ^ |
7865 * l2arc_write() | |
7866 * | | |
7867 * V | |
7868 * +-------+ +-------+
7869 * | vdev | | vdev |
7870 * | cache | | cache |
7871 * +-------+ +-------+
7872 * +=========+ .-----.
7873 * : L2ARC : |-_____-|
7874 * : devices : | Disks |
7875 * +=========+ `-_____-'
7876 *
7877 * Read requests are satisfied from the following sources, in order:
7878 *
7879 * 1) ARC
7880 * 2) vdev cache of L2ARC devices
7881 * 3) L2ARC devices
7882 * 4) vdev cache of disks
7883 * 5) disks
7884 *
7885 * Some L2ARC device types exhibit extremely slow write performance.
7886 * To accommodate this, there are some significant differences between
7887 * the L2ARC and traditional cache design:
7888 *
7889 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
7890 * the ARC behave as usual, freeing buffers and placing headers on ghost
7891 * lists. The ARC does not send buffers to the L2ARC during eviction as
7892 * this would add inflated write latencies for all ARC memory pressure.
7893 *
7894 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
7895 * It does this by periodically scanning buffers from the eviction-end of
7896 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3a17a7a9
SK
7897 * not already there. It scans until a headroom of buffers is satisfied,
7898 * which itself is a buffer for ARC eviction. If a compressible buffer is
7899 * found during scanning and selected for writing to an L2ARC device, we
7900 * temporarily boost scanning headroom during the next scan cycle to make
7901 * sure we adapt to compression effects (which might significantly reduce
7902 * the data volume we write to L2ARC). The thread that does this is
34dc7c2f
BB
7903 * l2arc_feed_thread(), illustrated below; example sizes are included to
7904 * provide a better sense of ratio than this diagram:
7905 *
7906 * head --> tail
7907 * +---------------------+----------+
7908 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
7909 * +---------------------+----------+ | o L2ARC eligible
7910 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
7911 * +---------------------+----------+ |
7912 * 15.9 Gbytes ^ 32 Mbytes |
7913 * headroom |
7914 * l2arc_feed_thread()
7915 * |
7916 * l2arc write hand <--[oooo]--'
7917 * | 8 Mbyte
7918 * | write max
7919 * V
7920 * +==============================+
7921 * L2ARC dev |####|#|###|###| |####| ... |
7922 * +==============================+
7923 * 32 Gbytes
7924 *
7925 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
7926 * evicted, then the L2ARC has cached a buffer much sooner than it probably
7927 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
7928 * safe to say that this is an uncommon case, since buffers at the end of
7929 * the ARC lists have moved there due to inactivity.
7930 *
7931 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
7932 * then the L2ARC simply misses copying some buffers. This serves as a
7933 * pressure valve to prevent heavy read workloads from both stalling the ARC
7934 * with waits and clogging the L2ARC with writes. This also helps prevent
7935 * the potential for the L2ARC to churn if it attempts to cache content too
7936 * quickly, such as during backups of the entire pool.
7937 *
b128c09f
BB
7938 * 5. After system boot and before the ARC has filled main memory, there are
7939 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
7940 * lists can remain mostly static. Instead of searching from tail of these
7941 * lists as pictured, the l2arc_feed_thread() will search from the list heads
7942 * for eligible buffers, greatly increasing its chance of finding them.
7943 *
7944 * The L2ARC device write speed is also boosted during this time so that
7945 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
7946 * there are no L2ARC reads, and no fear of degrading read performance
7947 * through increased writes.
7948 *
7949 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
34dc7c2f
BB
7950 * the vdev queue can aggregate them into larger and fewer writes. Each
7951 * device is written to in a rotor fashion, sweeping writes through
7952 * available space then repeating.
7953 *
b128c09f 7954 * 7. The L2ARC does not store dirty content. It never needs to flush
34dc7c2f
BB
7955 * write buffers back to disk based storage.
7956 *
b128c09f 7957 * 8. If an ARC buffer is written (and dirtied) which also exists in the
34dc7c2f
BB
7958 * L2ARC, the now stale L2ARC buffer is immediately dropped.
7959 *
7960 * The performance of the L2ARC can be tweaked by a number of tunables, which
7961 * may be necessary for different workloads:
7962 *
7963 * l2arc_write_max max write bytes per interval
b128c09f 7964 * l2arc_write_boost extra write bytes during device warmup
34dc7c2f
BB
7965 * l2arc_noprefetch skip caching prefetched buffers
7966 * l2arc_headroom number of max device writes to precache
3a17a7a9
SK
7967 * l2arc_headroom_boost when we find compressed buffers during ARC
7968 * scanning, we multiply headroom by this
7969 * percentage factor for the next scan cycle,
7970 * since more compressed buffers are likely to
7971 * be present
34dc7c2f
BB
7972 * l2arc_feed_secs seconds between L2ARC writing
7973 *
7974 * Tunables may be removed or added as future performance improvements are
7975 * integrated, and also may become zpool properties.
d164b209
BB
7976 *
7977 * There are three key functions that control how the L2ARC warms up:
7978 *
7979 * l2arc_write_eligible() check if a buffer is eligible to cache
7980 * l2arc_write_size() calculate how much to write
7981 * l2arc_write_interval() calculate sleep delay between writes
7982 *
7983 * These three functions determine what to write, how much, and how quickly
7984 * to send writes.
34dc7c2f
BB
7985 */
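
A rough feel for how the tunables listed above combine can be had from the sketch below. It assumes the usual module defaults (l2arc_write_max and l2arc_write_boost of 8 MiB, l2arc_headroom of 2, l2arc_feed_secs of 1); these are assumptions for illustration, not values taken from this file:

/* Back-of-the-envelope sketch of the L2ARC feed budget described above. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Assumed module defaults; adjust to match your system. */
	uint64_t l2arc_write_max = 8ULL << 20;	/* 8 MiB per interval */
	uint64_t l2arc_write_boost = 8ULL << 20;/* extra while ARC is cold */
	uint64_t l2arc_headroom = 2;		/* scan 2x the write size */
	uint64_t l2arc_feed_secs = 1;		/* one feed pass per second */

	uint64_t warm_write = l2arc_write_max;
	uint64_t cold_write = l2arc_write_max + l2arc_write_boost;

	printf("steady-state write rate: %llu MiB/s\n",
	    (unsigned long long)(warm_write / l2arc_feed_secs >> 20));
	printf("warmup write rate:       %llu MiB/s\n",
	    (unsigned long long)(cold_write / l2arc_feed_secs >> 20));
	printf("list scan headroom:      %llu MiB per pass\n",
	    (unsigned long long)(warm_write * l2arc_headroom >> 20));
	return (0);
}
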
7986
d164b209 7987static boolean_t
2a432414 7988l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
d164b209
BB
7989{
7990 /*
7991 * A buffer is *not* eligible for the L2ARC if it:
7992 * 1. belongs to a different spa.
428870ff
BB
7993 * 2. is already cached on the L2ARC.
7994 * 3. has an I/O in progress (it may be an incomplete read).
7995 * 4. is flagged not eligible (zfs property).
d164b209 7996 */
b9541d6b 7997 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
2a432414 7998 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
d164b209
BB
7999 return (B_FALSE);
8000
8001 return (B_TRUE);
8002}
8003
8004static uint64_t
3a17a7a9 8005l2arc_write_size(void)
d164b209
BB
8006{
8007 uint64_t size;
8008
3a17a7a9
SK
8009 /*
8010 * Make sure our globals have meaningful values in case the user
8011 * altered them.
8012 */
8013 size = l2arc_write_max;
8014 if (size == 0) {
8015 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
8016 "be greater than zero, resetting it to the default (%d)",
8017 L2ARC_WRITE_SIZE);
8018 size = l2arc_write_max = L2ARC_WRITE_SIZE;
8019 }
d164b209
BB
8020
8021 if (arc_warm == B_FALSE)
3a17a7a9 8022 size += l2arc_write_boost;
d164b209
BB
8023
8024 return (size);
8025
8026}
8027
8028static clock_t
8029l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
8030{
428870ff 8031 clock_t interval, next, now;
d164b209
BB
8032
8033 /*
8034 * If the ARC lists are busy, increase our write rate; if the
8035 * lists are stale, idle back. This is achieved by checking
8036 * how much we previously wrote - if it was more than half of
8037 * what we wanted, schedule the next write much sooner.
8038 */
8039 if (l2arc_feed_again && wrote > (wanted / 2))
8040 interval = (hz * l2arc_feed_min_ms) / 1000;
8041 else
8042 interval = hz * l2arc_feed_secs;
8043
428870ff
BB
8044 now = ddi_get_lbolt();
8045 next = MAX(now, MIN(now + interval, began + interval));
d164b209
BB
8046
8047 return (next);
8048}
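
The rescheduling above can be exercised in isolation. The sketch below mirrors the same arithmetic under assumed values (hz of 1000, l2arc_feed_secs of 1, l2arc_feed_min_ms of 200, l2arc_feed_again enabled), showing a busy pass waking up again after 200 ticks and an idle pass after a full second:

/* Userland sketch of l2arc_write_interval()'s scheduling arithmetic. */
#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/* Assumed values: hz and the usual module defaults. */
static const uint64_t hz = 1000;
static const uint64_t l2arc_feed_secs = 1;
static const uint64_t l2arc_feed_min_ms = 200;
static const int l2arc_feed_again = 1;

static uint64_t
write_interval(uint64_t began, uint64_t now, uint64_t wanted, uint64_t wrote)
{
	uint64_t interval;

	if (l2arc_feed_again && wrote > (wanted / 2))
		interval = (hz * l2arc_feed_min_ms) / 1000;	/* busy: 200ms */
	else
		interval = hz * l2arc_feed_secs;		/* idle: 1s */

	return (MAX(now, MIN(now + interval, began + interval)));
}

int
main(void)
{
	uint64_t began = 0, now = 50;		/* ticks; the pass took 50 */
	uint64_t wanted = 8ULL << 20;

	printf("busy pass -> next wakeup at tick %llu\n", (unsigned long long)
	    write_interval(began, now, wanted, wanted));	/* wrote all */
	printf("idle pass -> next wakeup at tick %llu\n", (unsigned long long)
	    write_interval(began, now, wanted, 0));		/* wrote none */
	return (0);
}
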
8049
34dc7c2f
BB
8050/*
8051 * Cycle through L2ARC devices. This is how L2ARC load balances.
b128c09f 8052 * If a device is returned, this also returns holding the spa config lock.
34dc7c2f
BB
8053 */
8054static l2arc_dev_t *
8055l2arc_dev_get_next(void)
8056{
b128c09f 8057 l2arc_dev_t *first, *next = NULL;
34dc7c2f 8058
b128c09f
BB
8059 /*
8060 * Lock out the removal of spas (spa_namespace_lock), then removal
8061 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
8062 * both locks will be dropped and a spa config lock held instead.
8063 */
8064 mutex_enter(&spa_namespace_lock);
8065 mutex_enter(&l2arc_dev_mtx);
8066
8067 /* if there are no vdevs, there is nothing to do */
8068 if (l2arc_ndev == 0)
8069 goto out;
8070
8071 first = NULL;
8072 next = l2arc_dev_last;
8073 do {
8074 /* loop around the list looking for a non-faulted vdev */
8075 if (next == NULL) {
34dc7c2f 8076 next = list_head(l2arc_dev_list);
b128c09f
BB
8077 } else {
8078 next = list_next(l2arc_dev_list, next);
8079 if (next == NULL)
8080 next = list_head(l2arc_dev_list);
8081 }
8082
8083 /* if we have come back to the start, bail out */
8084 if (first == NULL)
8085 first = next;
8086 else if (next == first)
8087 break;
8088
8089 } while (vdev_is_dead(next->l2ad_vdev));
8090
8091 /* if we were unable to find any usable vdevs, return NULL */
8092 if (vdev_is_dead(next->l2ad_vdev))
8093 next = NULL;
34dc7c2f
BB
8094
8095 l2arc_dev_last = next;
8096
b128c09f
BB
8097out:
8098 mutex_exit(&l2arc_dev_mtx);
8099
8100 /*
8101 * Grab the config lock to prevent the 'next' device from being
8102 * removed while we are writing to it.
8103 */
8104 if (next != NULL)
8105 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
8106 mutex_exit(&spa_namespace_lock);
8107
34dc7c2f
BB
8108 return (next);
8109}
8110
b128c09f
BB
8111/*
8112 * Free buffers that were tagged for destruction.
8113 */
8114static void
0bc8fd78 8115l2arc_do_free_on_write(void)
b128c09f
BB
8116{
8117 list_t *buflist;
8118 l2arc_data_free_t *df, *df_prev;
8119
8120 mutex_enter(&l2arc_free_on_write_mtx);
8121 buflist = l2arc_free_on_write;
8122
8123 for (df = list_tail(buflist); df; df = df_prev) {
8124 df_prev = list_prev(buflist, df);
a6255b7f
DQ
8125 ASSERT3P(df->l2df_abd, !=, NULL);
8126 abd_free(df->l2df_abd);
b128c09f
BB
8127 list_remove(buflist, df);
8128 kmem_free(df, sizeof (l2arc_data_free_t));
8129 }
8130
8131 mutex_exit(&l2arc_free_on_write_mtx);
8132}
8133
34dc7c2f
BB
8134/*
8135 * A write to a cache device has completed. Update all headers to allow
8136 * reads from these buffers to begin.
8137 */
8138static void
8139l2arc_write_done(zio_t *zio)
8140{
8141 l2arc_write_callback_t *cb;
8142 l2arc_dev_t *dev;
8143 list_t *buflist;
2a432414 8144 arc_buf_hdr_t *head, *hdr, *hdr_prev;
34dc7c2f 8145 kmutex_t *hash_lock;
3bec585e 8146 int64_t bytes_dropped = 0;
34dc7c2f
BB
8147
8148 cb = zio->io_private;
d3c2ae1c 8149 ASSERT3P(cb, !=, NULL);
34dc7c2f 8150 dev = cb->l2wcb_dev;
d3c2ae1c 8151 ASSERT3P(dev, !=, NULL);
34dc7c2f 8152 head = cb->l2wcb_head;
d3c2ae1c 8153 ASSERT3P(head, !=, NULL);
b9541d6b 8154 buflist = &dev->l2ad_buflist;
d3c2ae1c 8155 ASSERT3P(buflist, !=, NULL);
34dc7c2f
BB
8156 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
8157 l2arc_write_callback_t *, cb);
8158
8159 if (zio->io_error != 0)
8160 ARCSTAT_BUMP(arcstat_l2_writes_error);
8161
34dc7c2f
BB
8162 /*
8163 * All writes completed, or an error was hit.
8164 */
ca0bf58d
PS
8165top:
8166 mutex_enter(&dev->l2ad_mtx);
2a432414
GW
8167 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
8168 hdr_prev = list_prev(buflist, hdr);
34dc7c2f 8169
2a432414 8170 hash_lock = HDR_LOCK(hdr);
ca0bf58d
PS
8171
8172 /*
8173 * We cannot use mutex_enter or else we can deadlock
8174 * with l2arc_write_buffers (due to swapping the order
8175 * the hash lock and l2ad_mtx are taken).
8176 */
34dc7c2f
BB
8177 if (!mutex_tryenter(hash_lock)) {
8178 /*
ca0bf58d
PS
8179 * Missed the hash lock. We must retry so we
8180 * don't leave the ARC_FLAG_L2_WRITING bit set.
34dc7c2f 8181 */
ca0bf58d
PS
8182 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
8183
8184 /*
8185 * We don't want to rescan the headers we've
8186 * already marked as having been written out, so
8187 * we reinsert the head node so we can pick up
8188 * where we left off.
8189 */
8190 list_remove(buflist, head);
8191 list_insert_after(buflist, hdr, head);
8192
8193 mutex_exit(&dev->l2ad_mtx);
8194
8195 /*
8196 * We wait for the hash lock to become available
8197 * to try and prevent busy waiting, and increase
8198 * the chance we'll be able to acquire the lock
8199 * the next time around.
8200 */
8201 mutex_enter(hash_lock);
8202 mutex_exit(hash_lock);
8203 goto top;
34dc7c2f
BB
8204 }
8205
b9541d6b 8206 /*
ca0bf58d
PS
8207 * We could not have been moved into the arc_l2c_only
8208 * state while in-flight due to our ARC_FLAG_L2_WRITING
8209 * bit being set. Let's just ensure that's being enforced.
8210 */
8211 ASSERT(HDR_HAS_L1HDR(hdr));
8212
8a09d5fd
BB
8213 /*
8214 * Skipped - drop L2ARC entry and mark the header as no
8215		 * longer L2 eligible.
8216 */
d3c2ae1c 8217 if (zio->io_error != 0) {
34dc7c2f 8218 /*
b128c09f 8219 * Error - drop L2ARC entry.
34dc7c2f 8220 */
2a432414 8221 list_remove(buflist, hdr);
d3c2ae1c 8222 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
b9541d6b 8223
01850391
AG
8224 ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
8225 ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
d962d5da 8226
d3c2ae1c 8227 bytes_dropped += arc_hdr_size(hdr);
d962d5da 8228 (void) refcount_remove_many(&dev->l2ad_alloc,
d3c2ae1c 8229 arc_hdr_size(hdr), hdr);
34dc7c2f
BB
8230 }
8231
8232 /*
ca0bf58d
PS
8233 * Allow ARC to begin reads and ghost list evictions to
8234 * this L2ARC entry.
34dc7c2f 8235 */
d3c2ae1c 8236 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
34dc7c2f
BB
8237
8238 mutex_exit(hash_lock);
8239 }
8240
8241 atomic_inc_64(&l2arc_writes_done);
8242 list_remove(buflist, head);
b9541d6b
CW
8243 ASSERT(!HDR_HAS_L1HDR(head));
8244 kmem_cache_free(hdr_l2only_cache, head);
8245 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 8246
3bec585e
SK
8247 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
8248
b128c09f 8249 l2arc_do_free_on_write();
34dc7c2f
BB
8250
8251 kmem_free(cb, sizeof (l2arc_write_callback_t));
8252}
8253
b5256303
TC
8254static int
8255l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
8256{
8257 int ret;
8258 spa_t *spa = zio->io_spa;
8259 arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
8260 blkptr_t *bp = zio->io_bp;
b5256303
TC
8261 uint8_t salt[ZIO_DATA_SALT_LEN];
8262 uint8_t iv[ZIO_DATA_IV_LEN];
8263 uint8_t mac[ZIO_DATA_MAC_LEN];
8264 boolean_t no_crypt = B_FALSE;
8265
8266 /*
8267	 * ZIL data is never written to the L2ARC, so we don't need
8268 * special handling for its unique MAC storage.
8269 */
8270 ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
8271 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
440a3eb9 8272 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
b5256303 8273
440a3eb9
TC
8274 /*
8275 * If the data was encrypted, decrypt it now. Note that
8276 * we must check the bp here and not the hdr, since the
8277 * hdr does not have its encryption parameters updated
8278 * until arc_read_done().
8279 */
8280 if (BP_IS_ENCRYPTED(bp)) {
be9a5c35 8281 abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
b5256303
TC
8282
8283 zio_crypt_decode_params_bp(bp, salt, iv);
8284 zio_crypt_decode_mac_bp(bp, mac);
8285
be9a5c35
TC
8286 ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
8287 BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
8288 salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
8289 hdr->b_l1hdr.b_pabd, &no_crypt);
b5256303
TC
8290 if (ret != 0) {
8291 arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
b5256303
TC
8292 goto error;
8293 }
8294
b5256303
TC
8295 /*
8296 * If we actually performed decryption, replace b_pabd
8297 * with the decrypted data. Otherwise we can just throw
8298 * our decryption buffer away.
8299 */
8300 if (!no_crypt) {
8301 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
8302 arc_hdr_size(hdr), hdr);
8303 hdr->b_l1hdr.b_pabd = eabd;
8304 zio->io_abd = eabd;
8305 } else {
8306 arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
8307 }
8308 }
8309
8310 /*
8311 * If the L2ARC block was compressed, but ARC compression
8312 * is disabled we decompress the data into a new buffer and
8313 * replace the existing data.
8314 */
8315 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
8316 !HDR_COMPRESSION_ENABLED(hdr)) {
8317 abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
8318 void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
8319
8320 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
8321 hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
8322 HDR_GET_LSIZE(hdr));
8323 if (ret != 0) {
8324 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
8325 arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
8326 goto error;
8327 }
8328
8329 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
8330 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
8331 arc_hdr_size(hdr), hdr);
8332 hdr->b_l1hdr.b_pabd = cabd;
8333 zio->io_abd = cabd;
8334 zio->io_size = HDR_GET_LSIZE(hdr);
8335 }
8336
8337 return (0);
8338
8339error:
8340 return (ret);
8341}
8342
8343
34dc7c2f
BB
8344/*
8345 * A read to a cache device completed. Validate buffer contents before
8346 * handing over to the regular ARC routines.
8347 */
8348static void
8349l2arc_read_done(zio_t *zio)
8350{
b5256303 8351 int tfm_error = 0;
b405837a 8352 l2arc_read_callback_t *cb = zio->io_private;
34dc7c2f 8353 arc_buf_hdr_t *hdr;
34dc7c2f 8354 kmutex_t *hash_lock;
b405837a
TC
8355 boolean_t valid_cksum;
8356 boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
8357 (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
b128c09f 8358
d3c2ae1c 8359 ASSERT3P(zio->io_vd, !=, NULL);
b128c09f
BB
8360 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
8361
8362 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
34dc7c2f 8363
d3c2ae1c
GW
8364 ASSERT3P(cb, !=, NULL);
8365 hdr = cb->l2rcb_hdr;
8366 ASSERT3P(hdr, !=, NULL);
34dc7c2f 8367
d3c2ae1c 8368 hash_lock = HDR_LOCK(hdr);
34dc7c2f 8369 mutex_enter(hash_lock);
428870ff 8370 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
34dc7c2f 8371
82710e99
GDN
8372 /*
8373 * If the data was read into a temporary buffer,
8374 * move it and free the buffer.
8375 */
8376 if (cb->l2rcb_abd != NULL) {
8377 ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
8378 if (zio->io_error == 0) {
b405837a
TC
8379 if (using_rdata) {
8380 abd_copy(hdr->b_crypt_hdr.b_rabd,
8381 cb->l2rcb_abd, arc_hdr_size(hdr));
8382 } else {
8383 abd_copy(hdr->b_l1hdr.b_pabd,
8384 cb->l2rcb_abd, arc_hdr_size(hdr));
8385 }
82710e99
GDN
8386 }
8387
8388 /*
8389 * The following must be done regardless of whether
8390 * there was an error:
8391 * - free the temporary buffer
8392 * - point zio to the real ARC buffer
8393 * - set zio size accordingly
8394 * These are required because zio is either re-used for
8395 * an I/O of the block in the case of the error
8396 * or the zio is passed to arc_read_done() and it
8397 * needs real data.
8398 */
8399 abd_free(cb->l2rcb_abd);
8400 zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
440a3eb9 8401
b405837a 8402 if (using_rdata) {
440a3eb9
TC
8403 ASSERT(HDR_HAS_RABD(hdr));
8404 zio->io_abd = zio->io_orig_abd =
8405 hdr->b_crypt_hdr.b_rabd;
8406 } else {
8407 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
8408 zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
8409 }
82710e99
GDN
8410 }
8411
a6255b7f 8412 ASSERT3P(zio->io_abd, !=, NULL);
3a17a7a9 8413
34dc7c2f
BB
8414 /*
8415 * Check this survived the L2ARC journey.
8416 */
b5256303
TC
8417 ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
8418 (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
d3c2ae1c
GW
8419 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
8420 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
8421
8422 valid_cksum = arc_cksum_is_equal(hdr, zio);
b5256303
TC
8423
8424 /*
8425 * b_rabd will always match the data as it exists on disk if it is
8426 * being used. Therefore if we are reading into b_rabd we do not
8427 * attempt to untransform the data.
8428 */
8429 if (valid_cksum && !using_rdata)
8430 tfm_error = l2arc_untransform(zio, cb);
8431
8432 if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
8433 !HDR_L2_EVICTED(hdr)) {
34dc7c2f 8434 mutex_exit(hash_lock);
d3c2ae1c 8435 zio->io_private = hdr;
34dc7c2f
BB
8436 arc_read_done(zio);
8437 } else {
8438 mutex_exit(hash_lock);
8439 /*
8440 * Buffer didn't survive caching. Increment stats and
8441 * reissue to the original storage device.
8442 */
b128c09f 8443 if (zio->io_error != 0) {
34dc7c2f 8444 ARCSTAT_BUMP(arcstat_l2_io_error);
b128c09f 8445 } else {
2e528b49 8446 zio->io_error = SET_ERROR(EIO);
b128c09f 8447 }
b5256303 8448 if (!valid_cksum || tfm_error != 0)
34dc7c2f
BB
8449 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
8450
34dc7c2f 8451 /*
b128c09f
BB
8452 * If there's no waiter, issue an async i/o to the primary
8453 * storage now. If there *is* a waiter, the caller must
8454 * issue the i/o in a context where it's OK to block.
34dc7c2f 8455 */
d164b209
BB
8456 if (zio->io_waiter == NULL) {
8457 zio_t *pio = zio_unique_parent(zio);
b5256303
TC
8458 void *abd = (using_rdata) ?
8459 hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
d164b209
BB
8460
8461 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
8462
d3c2ae1c 8463 zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
b5256303 8464 abd, zio->io_size, arc_read_done,
d3c2ae1c
GW
8465 hdr, zio->io_priority, cb->l2rcb_flags,
8466 &cb->l2rcb_zb));
d164b209 8467 }
34dc7c2f
BB
8468 }
8469
8470 kmem_free(cb, sizeof (l2arc_read_callback_t));
8471}
8472
8473/*
8474 * This is the list priority from which the L2ARC will search for pages to
8475 * cache. This is used within loops (0..3) to cycle through lists in the
8476 * desired order. This order can have a significant effect on cache
8477 * performance.
8478 *
8479 * Currently the metadata lists are hit first, MFU then MRU, followed by
8480 * the data lists. This function returns a locked list, and also returns
8481 * the lock pointer.
8482 */
ca0bf58d
PS
8483static multilist_sublist_t *
8484l2arc_sublist_lock(int list_num)
34dc7c2f 8485{
ca0bf58d
PS
8486 multilist_t *ml = NULL;
8487 unsigned int idx;
34dc7c2f 8488
4aafab91 8489 ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
34dc7c2f
BB
8490
8491 switch (list_num) {
8492 case 0:
64fc7762 8493 ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
34dc7c2f
BB
8494 break;
8495 case 1:
64fc7762 8496 ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
34dc7c2f
BB
8497 break;
8498 case 2:
64fc7762 8499 ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
34dc7c2f
BB
8500 break;
8501 case 3:
64fc7762 8502 ml = arc_mru->arcs_list[ARC_BUFC_DATA];
34dc7c2f 8503 break;
4aafab91
G
8504 default:
8505 return (NULL);
34dc7c2f
BB
8506 }
8507
ca0bf58d
PS
8508 /*
8509 * Return a randomly-selected sublist. This is acceptable
8510 * because the caller feeds only a little bit of data for each
8511 * call (8MB). Subsequent calls will result in different
8512 * sublists being selected.
8513 */
8514 idx = multilist_get_random_index(ml);
8515 return (multilist_sublist_lock(ml, idx));
34dc7c2f
BB
8516}
8517
8518/*
8519 * Evict buffers from the device write hand to the distance specified in
8520 * bytes. This distance may span populated buffers, it may span nothing.
8521 * This is clearing a region on the L2ARC device ready for writing.
8522 * If the 'all' boolean is set, every buffer is evicted.
8523 */
8524static void
8525l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
8526{
8527 list_t *buflist;
2a432414 8528 arc_buf_hdr_t *hdr, *hdr_prev;
34dc7c2f
BB
8529 kmutex_t *hash_lock;
8530 uint64_t taddr;
8531
b9541d6b 8532 buflist = &dev->l2ad_buflist;
34dc7c2f
BB
8533
8534 if (!all && dev->l2ad_first) {
8535 /*
8536 * This is the first sweep through the device. There is
8537 * nothing to evict.
8538 */
8539 return;
8540 }
8541
b128c09f 8542 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
34dc7c2f
BB
8543 /*
8544 * When nearing the end of the device, evict to the end
8545 * before the device write hand jumps to the start.
8546 */
8547 taddr = dev->l2ad_end;
8548 } else {
8549 taddr = dev->l2ad_hand + distance;
8550 }
8551 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
8552 uint64_t, taddr, boolean_t, all);
8553
8554top:
b9541d6b 8555 mutex_enter(&dev->l2ad_mtx);
2a432414
GW
8556 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
8557 hdr_prev = list_prev(buflist, hdr);
34dc7c2f 8558
2a432414 8559 hash_lock = HDR_LOCK(hdr);
ca0bf58d
PS
8560
8561 /*
8562 * We cannot use mutex_enter or else we can deadlock
8563 * with l2arc_write_buffers (due to swapping the order
8564 * the hash lock and l2ad_mtx are taken).
8565 */
34dc7c2f
BB
8566 if (!mutex_tryenter(hash_lock)) {
8567 /*
8568 * Missed the hash lock. Retry.
8569 */
8570 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
b9541d6b 8571 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
8572 mutex_enter(hash_lock);
8573 mutex_exit(hash_lock);
8574 goto top;
8575 }
8576
f06f53fa
AG
8577 /*
8578		 * A header can't be on this list if it doesn't have an L2 header.
8579 */
8580 ASSERT(HDR_HAS_L2HDR(hdr));
34dc7c2f 8581
f06f53fa
AG
8582 /* Ensure this header has finished being written. */
8583 ASSERT(!HDR_L2_WRITING(hdr));
8584 ASSERT(!HDR_L2_WRITE_HEAD(hdr));
8585
8586 if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
b9541d6b 8587 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
34dc7c2f
BB
8588 /*
8589 * We've evicted to the target address,
8590 * or the end of the device.
8591 */
8592 mutex_exit(hash_lock);
8593 break;
8594 }
8595
b9541d6b 8596 if (!HDR_HAS_L1HDR(hdr)) {
2a432414 8597 ASSERT(!HDR_L2_READING(hdr));
34dc7c2f
BB
8598 /*
8599 * This doesn't exist in the ARC. Destroy.
8600 * arc_hdr_destroy() will call list_remove()
01850391 8601 * and decrement arcstat_l2_lsize.
34dc7c2f 8602 */
2a432414
GW
8603 arc_change_state(arc_anon, hdr, hash_lock);
8604 arc_hdr_destroy(hdr);
34dc7c2f 8605 } else {
b9541d6b
CW
8606 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
8607 ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
b128c09f
BB
8608 /*
8609 * Invalidate issued or about to be issued
8610 * reads, since we may be about to write
8611 * over this location.
8612 */
2a432414 8613 if (HDR_L2_READING(hdr)) {
b128c09f 8614 ARCSTAT_BUMP(arcstat_l2_evict_reading);
d3c2ae1c 8615 arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
b128c09f
BB
8616 }
8617
d962d5da 8618 arc_hdr_l2hdr_destroy(hdr);
34dc7c2f
BB
8619 }
8620 mutex_exit(hash_lock);
8621 }
b9541d6b 8622 mutex_exit(&dev->l2ad_mtx);
34dc7c2f
BB
8623}
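
The target-address selection at the top of l2arc_evict() is easiest to see with numbers. The small userland sketch below (the device geometry is an assumed example: a 32 GiB cache device with a 16 MiB write about to land) shows the normal case and the wrap-around case near the end of the device:

/* Userland sketch of the eviction target (taddr) selection in l2arc_evict(). */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Assumed example geometry; not read from a real vdev. */
	uint64_t l2ad_end = 32ULL << 30;	/* end of the usable device */
	uint64_t distance = 16ULL << 20;	/* bytes about to be written */
	uint64_t hands[2] = { 1ULL << 30, l2ad_end - (20ULL << 20) };

	for (int i = 0; i < 2; i++) {
		uint64_t l2ad_hand = hands[i];
		uint64_t taddr;

		/*
		 * Same branch as l2arc_evict(): when the hand is within
		 * two write distances of the end, clear all the way to
		 * the end before the hand wraps back to l2ad_start.
		 */
		if (l2ad_hand >= (l2ad_end - (2 * distance)))
			taddr = l2ad_end;
		else
			taddr = l2ad_hand + distance;

		printf("hand at %llu MiB -> evict up to %llu MiB\n",
		    (unsigned long long)(l2ad_hand >> 20),
		    (unsigned long long)(taddr >> 20));
	}
	return (0);
}
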
8624
b5256303
TC
8625/*
8626 * Handle any abd transforms that might be required for writing to the L2ARC.
8627 * If successful, this function will always return an abd with the data
8628 * transformed as it is on disk in a new abd of asize bytes.
8629 */
8630static int
8631l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
8632 abd_t **abd_out)
8633{
8634 int ret;
8635 void *tmp = NULL;
8636 abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
8637 enum zio_compress compress = HDR_GET_COMPRESS(hdr);
8638 uint64_t psize = HDR_GET_PSIZE(hdr);
8639 uint64_t size = arc_hdr_size(hdr);
8640 boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
8641 boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
8642 dsl_crypto_key_t *dck = NULL;
8643 uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
4807c0ba 8644 boolean_t no_crypt = B_FALSE;
b5256303
TC
8645
8646 ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
8647 !HDR_COMPRESSION_ENABLED(hdr)) ||
8648 HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
8649 ASSERT3U(psize, <=, asize);
8650
8651 /*
8652	 * If this data simply needs its own buffer, we allocate it
8653	 * and copy the data. This may be done to eliminate a dependency on a
8654 * shared buffer or to reallocate the buffer to match asize.
8655 */
4807c0ba 8656 if (HDR_HAS_RABD(hdr) && asize != psize) {
10adee27 8657 ASSERT3U(asize, >=, psize);
4807c0ba 8658 to_write = abd_alloc_for_io(asize, ismd);
10adee27
TC
8659 abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
8660 if (psize != asize)
8661 abd_zero_off(to_write, psize, asize - psize);
4807c0ba
TC
8662 goto out;
8663 }
8664
b5256303
TC
8665 if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
8666 !HDR_ENCRYPTED(hdr)) {
8667 ASSERT3U(size, ==, psize);
8668 to_write = abd_alloc_for_io(asize, ismd);
8669 abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
8670 if (size != asize)
8671 abd_zero_off(to_write, size, asize - size);
8672 goto out;
8673 }
8674
8675 if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
8676 cabd = abd_alloc_for_io(asize, ismd);
8677 tmp = abd_borrow_buf(cabd, asize);
8678
8679 psize = zio_compress_data(compress, to_write, tmp, size);
8680 ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
8681 if (psize < asize)
8682 bzero((char *)tmp + psize, asize - psize);
8683 psize = HDR_GET_PSIZE(hdr);
8684 abd_return_buf_copy(cabd, tmp, asize);
8685 to_write = cabd;
8686 }
8687
8688 if (HDR_ENCRYPTED(hdr)) {
8689 eabd = abd_alloc_for_io(asize, ismd);
8690
8691 /*
8692 * If the dataset was disowned before the buffer
8693 * made it to this point, the key to re-encrypt
8694 * it won't be available. In this case we simply
8695 * won't write the buffer to the L2ARC.
8696 */
8697 ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
8698 FTAG, &dck);
8699 if (ret != 0)
8700 goto error;
8701
8702 ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
be9a5c35
TC
8703 hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
8704 hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
8705 &no_crypt);
b5256303
TC
8706 if (ret != 0)
8707 goto error;
8708
4807c0ba
TC
8709 if (no_crypt)
8710 abd_copy(eabd, to_write, psize);
b5256303
TC
8711
8712 if (psize != asize)
8713 abd_zero_off(eabd, psize, asize - psize);
8714
8715 /* assert that the MAC we got here matches the one we saved */
8716 ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
8717 spa_keystore_dsl_key_rele(spa, dck, FTAG);
8718
8719 if (to_write == cabd)
8720 abd_free(cabd);
8721
8722 to_write = eabd;
8723 }
8724
8725out:
8726 ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
8727 *abd_out = to_write;
8728 return (0);
8729
8730error:
8731 if (dck != NULL)
8732 spa_keystore_dsl_key_rele(spa, dck, FTAG);
8733 if (cabd != NULL)
8734 abd_free(cabd);
8735 if (eabd != NULL)
8736 abd_free(eabd);
8737
8738 *abd_out = NULL;
8739 return (ret);
8740}
8741
34dc7c2f
BB
8742/*
8743 * Find and write ARC buffers to the L2ARC device.
8744 *
2a432414 8745 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
34dc7c2f 8746 * for reading until they have completed writing.
3a17a7a9
SK
8747 * The headroom boost (l2arc_headroom_boost) is applied internally when
8748 * compressed ARC is enabled, widening the scan window for each pass.
8749 *
8750 * Returns the number of bytes actually written (which may be smaller than
8751 * the delta by which the device hand has changed due to alignment).
34dc7c2f 8752 */
d164b209 8753static uint64_t
d3c2ae1c 8754l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
34dc7c2f 8755{
2a432414 8756 arc_buf_hdr_t *hdr, *hdr_prev, *head;
01850391 8757 uint64_t write_asize, write_psize, write_lsize, headroom;
3a17a7a9 8758 boolean_t full;
34dc7c2f
BB
8759 l2arc_write_callback_t *cb;
8760 zio_t *pio, *wzio;
3541dc6d 8761 uint64_t guid = spa_load_guid(spa);
34dc7c2f 8762
d3c2ae1c 8763 ASSERT3P(dev->l2ad_vdev, !=, NULL);
3a17a7a9 8764
34dc7c2f 8765 pio = NULL;
01850391 8766 write_lsize = write_asize = write_psize = 0;
34dc7c2f 8767 full = B_FALSE;
b9541d6b 8768 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
d3c2ae1c 8769 arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
3a17a7a9 8770
34dc7c2f
BB
8771 /*
8772 * Copy buffers for L2ARC writing.
8773 */
1c27024e 8774 for (int try = 0; try < L2ARC_FEED_TYPES; try++) {
ca0bf58d 8775 multilist_sublist_t *mls = l2arc_sublist_lock(try);
3a17a7a9
SK
8776 uint64_t passed_sz = 0;
8777
4aafab91
G
8778 VERIFY3P(mls, !=, NULL);
8779
b128c09f
BB
8780 /*
8781 * L2ARC fast warmup.
8782 *
8783 * Until the ARC is warm and starts to evict, read from the
8784 * head of the ARC lists rather than the tail.
8785 */
b128c09f 8786 if (arc_warm == B_FALSE)
ca0bf58d 8787 hdr = multilist_sublist_head(mls);
b128c09f 8788 else
ca0bf58d 8789 hdr = multilist_sublist_tail(mls);
b128c09f 8790
3a17a7a9 8791 headroom = target_sz * l2arc_headroom;
d3c2ae1c 8792 if (zfs_compressed_arc_enabled)
3a17a7a9
SK
8793 headroom = (headroom * l2arc_headroom_boost) / 100;
8794
2a432414 8795 for (; hdr; hdr = hdr_prev) {
3a17a7a9 8796 kmutex_t *hash_lock;
b5256303 8797 abd_t *to_write = NULL;
3a17a7a9 8798
b128c09f 8799 if (arc_warm == B_FALSE)
ca0bf58d 8800 hdr_prev = multilist_sublist_next(mls, hdr);
b128c09f 8801 else
ca0bf58d 8802 hdr_prev = multilist_sublist_prev(mls, hdr);
34dc7c2f 8803
2a432414 8804 hash_lock = HDR_LOCK(hdr);
3a17a7a9 8805 if (!mutex_tryenter(hash_lock)) {
34dc7c2f
BB
8806 /*
8807 * Skip this buffer rather than waiting.
8808 */
8809 continue;
8810 }
8811
d3c2ae1c 8812 passed_sz += HDR_GET_LSIZE(hdr);
34dc7c2f
BB
8813 if (passed_sz > headroom) {
8814 /*
8815 * Searched too far.
8816 */
8817 mutex_exit(hash_lock);
8818 break;
8819 }
8820
2a432414 8821 if (!l2arc_write_eligible(guid, hdr)) {
34dc7c2f
BB
8822 mutex_exit(hash_lock);
8823 continue;
8824 }
8825
01850391
AG
8826 /*
8827 * We rely on the L1 portion of the header below, so
8828 * it's invalid for this header to have been evicted out
8829 * of the ghost cache, prior to being written out. The
8830 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
8831 */
8832 ASSERT(HDR_HAS_L1HDR(hdr));
8833
8834 ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
01850391 8835 ASSERT3U(arc_hdr_size(hdr), >, 0);
b5256303
TC
8836 ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
8837 HDR_HAS_RABD(hdr));
8838 uint64_t psize = HDR_GET_PSIZE(hdr);
01850391
AG
8839 uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
8840 psize);
8841
8842 if ((write_asize + asize) > target_sz) {
34dc7c2f
BB
8843 full = B_TRUE;
8844 mutex_exit(hash_lock);
8845 break;
8846 }
8847
b5256303
TC
8848 /*
8849 * We rely on the L1 portion of the header below, so
8850 * it's invalid for this header to have been evicted out
8851 * of the ghost cache, prior to being written out. The
8852 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
8853 */
8854 arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
8855 ASSERT(HDR_HAS_L1HDR(hdr));
8856
8857 ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
8858 ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
8859 HDR_HAS_RABD(hdr));
8860 ASSERT3U(arc_hdr_size(hdr), >, 0);
8861
8862 /*
8863 * If this header has b_rabd, we can use this since it
8864 * must always match the data exactly as it exists on
8865 * disk. Otherwise, the L2ARC can normally use the
8866 * hdr's data, but if we're sharing data between the
8867 * hdr and one of its bufs, L2ARC needs its own copy of
8868 * the data so that the ZIO below can't race with the
8869 * buf consumer. To ensure that this copy will be
8870 * available for the lifetime of the ZIO and be cleaned
8871 * up afterwards, we add it to the l2arc_free_on_write
8872 * queue. If we need to apply any transforms to the
8873 * data (compression, encryption) we will also need the
8874 * extra buffer.
8875 */
8876 if (HDR_HAS_RABD(hdr) && psize == asize) {
8877 to_write = hdr->b_crypt_hdr.b_rabd;
8878 } else if ((HDR_COMPRESSION_ENABLED(hdr) ||
8879 HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
8880 !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
8881 psize == asize) {
8882 to_write = hdr->b_l1hdr.b_pabd;
8883 } else {
8884 int ret;
8885 arc_buf_contents_t type = arc_buf_type(hdr);
8886
8887 ret = l2arc_apply_transforms(spa, hdr, asize,
8888 &to_write);
8889 if (ret != 0) {
8890 arc_hdr_clear_flags(hdr,
8891 ARC_FLAG_L2_WRITING);
8892 mutex_exit(hash_lock);
8893 continue;
8894 }
8895
8896 l2arc_free_abd_on_write(to_write, asize, type);
8897 }
8898
34dc7c2f
BB
8899 if (pio == NULL) {
8900 /*
8901 * Insert a dummy header on the buflist so
8902 * l2arc_write_done() can find where the
8903 * write buffers begin without searching.
8904 */
ca0bf58d 8905 mutex_enter(&dev->l2ad_mtx);
b9541d6b 8906 list_insert_head(&dev->l2ad_buflist, head);
ca0bf58d 8907 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 8908
96c080cb
BB
8909 cb = kmem_alloc(
8910 sizeof (l2arc_write_callback_t), KM_SLEEP);
34dc7c2f
BB
8911 cb->l2wcb_dev = dev;
8912 cb->l2wcb_head = head;
8913 pio = zio_root(spa, l2arc_write_done, cb,
8914 ZIO_FLAG_CANFAIL);
8915 }
8916
b9541d6b 8917 hdr->b_l2hdr.b_dev = dev;
b9541d6b 8918 hdr->b_l2hdr.b_hits = 0;
3a17a7a9 8919
d3c2ae1c 8920 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
b5256303 8921 arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
3a17a7a9 8922
ca0bf58d 8923 mutex_enter(&dev->l2ad_mtx);
b9541d6b 8924 list_insert_head(&dev->l2ad_buflist, hdr);
ca0bf58d 8925 mutex_exit(&dev->l2ad_mtx);
34dc7c2f 8926
b5256303
TC
8927 (void) refcount_add_many(&dev->l2ad_alloc,
8928 arc_hdr_size(hdr), hdr);
3a17a7a9 8929
34dc7c2f 8930 wzio = zio_write_phys(pio, dev->l2ad_vdev,
82710e99 8931 hdr->b_l2hdr.b_daddr, asize, to_write,
d3c2ae1c
GW
8932 ZIO_CHECKSUM_OFF, NULL, hdr,
8933 ZIO_PRIORITY_ASYNC_WRITE,
34dc7c2f
BB
8934 ZIO_FLAG_CANFAIL, B_FALSE);
8935
01850391 8936 write_lsize += HDR_GET_LSIZE(hdr);
34dc7c2f
BB
8937 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
8938 zio_t *, wzio);
d962d5da 8939
01850391
AG
8940 write_psize += psize;
8941 write_asize += asize;
d3c2ae1c
GW
8942 dev->l2ad_hand += asize;
8943
8944 mutex_exit(hash_lock);
8945
8946 (void) zio_nowait(wzio);
34dc7c2f 8947 }
d3c2ae1c
GW
8948
8949 multilist_sublist_unlock(mls);
8950
8951 if (full == B_TRUE)
8952 break;
34dc7c2f 8953 }
34dc7c2f 8954
d3c2ae1c
GW
8955 /* No buffers selected for writing? */
8956 if (pio == NULL) {
01850391 8957 ASSERT0(write_lsize);
d3c2ae1c
GW
8958 ASSERT(!HDR_HAS_L1HDR(head));
8959 kmem_cache_free(hdr_l2only_cache, head);
8960 return (0);
8961 }
34dc7c2f 8962
3a17a7a9 8963 ASSERT3U(write_asize, <=, target_sz);
34dc7c2f 8964 ARCSTAT_BUMP(arcstat_l2_writes_sent);
01850391
AG
8965 ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
8966 ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
8967 ARCSTAT_INCR(arcstat_l2_psize, write_psize);
8968 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
34dc7c2f
BB
8969
8970 /*
8971 * Bump device hand to the device start if it is approaching the end.
8972 * l2arc_evict() will already have evicted ahead for this case.
8973 */
b128c09f 8974 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
34dc7c2f 8975 dev->l2ad_hand = dev->l2ad_start;
34dc7c2f
BB
8976 dev->l2ad_first = B_FALSE;
8977 }
8978
d164b209 8979 dev->l2ad_writing = B_TRUE;
34dc7c2f 8980 (void) zio_wait(pio);
d164b209
BB
8981 dev->l2ad_writing = B_FALSE;
8982
3a17a7a9
SK
8983 return (write_asize);
8984}
8985
34dc7c2f
BB
8986/*
8987 * This thread feeds the L2ARC at regular intervals. This is the beating
8988 * heart of the L2ARC.
8989 */
867959b5 8990/* ARGSUSED */
34dc7c2f 8991static void
c25b8f99 8992l2arc_feed_thread(void *unused)
34dc7c2f
BB
8993{
8994 callb_cpr_t cpr;
8995 l2arc_dev_t *dev;
8996 spa_t *spa;
d164b209 8997 uint64_t size, wrote;
428870ff 8998 clock_t begin, next = ddi_get_lbolt();
40d06e3c 8999 fstrans_cookie_t cookie;
34dc7c2f
BB
9000
9001 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
9002
9003 mutex_enter(&l2arc_feed_thr_lock);
9004
40d06e3c 9005 cookie = spl_fstrans_mark();
34dc7c2f 9006 while (l2arc_thread_exit == 0) {
34dc7c2f 9007 CALLB_CPR_SAFE_BEGIN(&cpr);
b64ccd6c 9008 (void) cv_timedwait_sig(&l2arc_feed_thr_cv,
5b63b3eb 9009 &l2arc_feed_thr_lock, next);
34dc7c2f 9010 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
428870ff 9011 next = ddi_get_lbolt() + hz;
34dc7c2f
BB
9012
9013 /*
b128c09f 9014 * Quick check for L2ARC devices.
34dc7c2f
BB
9015 */
9016 mutex_enter(&l2arc_dev_mtx);
9017 if (l2arc_ndev == 0) {
9018 mutex_exit(&l2arc_dev_mtx);
9019 continue;
9020 }
b128c09f 9021 mutex_exit(&l2arc_dev_mtx);
428870ff 9022 begin = ddi_get_lbolt();
34dc7c2f
BB
9023
9024 /*
b128c09f
BB
9025 * This selects the next l2arc device to write to, and in
9026 * doing so the next spa to feed from: dev->l2ad_spa. This
9027 * will return NULL if there are now no l2arc devices or if
9028 * they are all faulted.
9029 *
9030 * If a device is returned, its spa's config lock is also
9031 * held to prevent device removal. l2arc_dev_get_next()
9032 * will grab and release l2arc_dev_mtx.
34dc7c2f 9033 */
b128c09f 9034 if ((dev = l2arc_dev_get_next()) == NULL)
34dc7c2f 9035 continue;
b128c09f
BB
9036
9037 spa = dev->l2ad_spa;
d3c2ae1c 9038 ASSERT3P(spa, !=, NULL);
34dc7c2f 9039
572e2857
BB
9040 /*
9041 * If the pool is read-only then force the feed thread to
9042 * sleep a little longer.
9043 */
9044 if (!spa_writeable(spa)) {
9045 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
9046 spa_config_exit(spa, SCL_L2ARC, dev);
9047 continue;
9048 }
9049
34dc7c2f 9050 /*
b128c09f 9051 * Avoid contributing to memory pressure.
34dc7c2f 9052 */
ca67b33a 9053 if (arc_reclaim_needed()) {
b128c09f
BB
9054 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
9055 spa_config_exit(spa, SCL_L2ARC, dev);
34dc7c2f
BB
9056 continue;
9057 }
b128c09f 9058
34dc7c2f
BB
9059 ARCSTAT_BUMP(arcstat_l2_feeds);
9060
3a17a7a9 9061 size = l2arc_write_size();
b128c09f 9062
34dc7c2f
BB
9063 /*
9064 * Evict L2ARC buffers that will be overwritten.
9065 */
b128c09f 9066 l2arc_evict(dev, size, B_FALSE);
34dc7c2f
BB
9067
9068 /*
9069 * Write ARC buffers.
9070 */
d3c2ae1c 9071 wrote = l2arc_write_buffers(spa, dev, size);
d164b209
BB
9072
9073 /*
9074 * Calculate interval between writes.
9075 */
9076 next = l2arc_write_interval(begin, size, wrote);
b128c09f 9077 spa_config_exit(spa, SCL_L2ARC, dev);
34dc7c2f 9078 }
40d06e3c 9079 spl_fstrans_unmark(cookie);
34dc7c2f
BB
9080
9081 l2arc_thread_exit = 0;
9082 cv_broadcast(&l2arc_feed_thr_cv);
9083 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
9084 thread_exit();
9085}
9086
b128c09f
BB
9087boolean_t
9088l2arc_vdev_present(vdev_t *vd)
9089{
9090 l2arc_dev_t *dev;
9091
9092 mutex_enter(&l2arc_dev_mtx);
9093 for (dev = list_head(l2arc_dev_list); dev != NULL;
9094 dev = list_next(l2arc_dev_list, dev)) {
9095 if (dev->l2ad_vdev == vd)
9096 break;
9097 }
9098 mutex_exit(&l2arc_dev_mtx);
9099
9100 return (dev != NULL);
9101}
9102
34dc7c2f
BB
9103/*
9104 * Add a vdev for use by the L2ARC. By this point the spa has already
9105 * validated the vdev and opened it.
9106 */
9107void
9babb374 9108l2arc_add_vdev(spa_t *spa, vdev_t *vd)
34dc7c2f
BB
9109{
9110 l2arc_dev_t *adddev;
9111
b128c09f
BB
9112 ASSERT(!l2arc_vdev_present(vd));
9113
34dc7c2f
BB
9114 /*
9115 * Create a new l2arc device entry.
9116 */
9117 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
9118 adddev->l2ad_spa = spa;
9119 adddev->l2ad_vdev = vd;
9babb374
BB
9120 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
9121 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
34dc7c2f 9122 adddev->l2ad_hand = adddev->l2ad_start;
34dc7c2f 9123 adddev->l2ad_first = B_TRUE;
d164b209 9124 adddev->l2ad_writing = B_FALSE;
98f72a53 9125 list_link_init(&adddev->l2ad_node);
34dc7c2f 9126
b9541d6b 9127 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
9128 /*
9129 * This is a list of all ARC buffers that are still valid on the
9130 * device.
9131 */
b9541d6b
CW
9132 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
9133 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
34dc7c2f 9134
428870ff 9135 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
d962d5da 9136 refcount_create(&adddev->l2ad_alloc);
34dc7c2f
BB
9137
9138 /*
9139 * Add device to global list
9140 */
9141 mutex_enter(&l2arc_dev_mtx);
9142 list_insert_head(l2arc_dev_list, adddev);
9143 atomic_inc_64(&l2arc_ndev);
9144 mutex_exit(&l2arc_dev_mtx);
9145}
9146
9147/*
9148 * Remove a vdev from the L2ARC.
9149 */
9150void
9151l2arc_remove_vdev(vdev_t *vd)
9152{
9153 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
9154
34dc7c2f
BB
9155 /*
9156 * Find the device by vdev
9157 */
9158 mutex_enter(&l2arc_dev_mtx);
9159 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
9160 nextdev = list_next(l2arc_dev_list, dev);
9161 if (vd == dev->l2ad_vdev) {
9162 remdev = dev;
9163 break;
9164 }
9165 }
d3c2ae1c 9166 ASSERT3P(remdev, !=, NULL);
9167
9168 /*
9169 * Remove device from global list
9170 */
9171 list_remove(l2arc_dev_list, remdev);
9172 l2arc_dev_last = NULL; /* may have been invalidated */
9173 atomic_dec_64(&l2arc_ndev);
9174 mutex_exit(&l2arc_dev_mtx);
9175
9176 /*
 9177	 * Clear all buflists and ARC references; this flushes the L2ARC device.
9178 */
9179 l2arc_evict(remdev, 0, B_TRUE);
9180 list_destroy(&remdev->l2ad_buflist);
9181 mutex_destroy(&remdev->l2ad_mtx);
d962d5da 9182 refcount_destroy(&remdev->l2ad_alloc);
34dc7c2f 9183 kmem_free(remdev, sizeof (l2arc_dev_t));
9184}
9185
9186void
b128c09f 9187l2arc_init(void)
9188{
9189 l2arc_thread_exit = 0;
9190 l2arc_ndev = 0;
9191 l2arc_writes_sent = 0;
9192 l2arc_writes_done = 0;
9193
9194 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
9195 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
9196 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
9197 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
9198
9199 l2arc_dev_list = &L2ARC_dev_list;
9200 l2arc_free_on_write = &L2ARC_free_on_write;
9201 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
9202 offsetof(l2arc_dev_t, l2ad_node));
9203 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
9204 offsetof(l2arc_data_free_t, l2df_list_node));
9205}
9206
9207void
b128c09f 9208l2arc_fini(void)
34dc7c2f 9209{
9210 /*
 9211	 * This is called from dmu_fini(), which is called from spa_fini().
9212 * Because of this, we can assume that all l2arc devices have
9213 * already been removed when the pools themselves were removed.
9214 */
9215
9216 l2arc_do_free_on_write();
9217
9218 mutex_destroy(&l2arc_feed_thr_lock);
9219 cv_destroy(&l2arc_feed_thr_cv);
9220 mutex_destroy(&l2arc_dev_mtx);
9221 mutex_destroy(&l2arc_free_on_write_mtx);
9222
9223 list_destroy(l2arc_dev_list);
9224 list_destroy(l2arc_free_on_write);
9225}
9226
9227void
9228l2arc_start(void)
9229{
fb5f0bc8 9230 if (!(spa_mode_global & FWRITE))
9231 return;
9232
9233 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
1229323d 9234 TS_RUN, defclsyspri);
9235}
9236
9237void
9238l2arc_stop(void)
9239{
fb5f0bc8 9240 if (!(spa_mode_global & FWRITE))
9241 return;
9242
9243 mutex_enter(&l2arc_feed_thr_lock);
9244 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
9245 l2arc_thread_exit = 1;
9246 while (l2arc_thread_exit != 0)
9247 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
9248 mutex_exit(&l2arc_feed_thr_lock);
9249}
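/*
 * Illustrative sketch, not part of arc.c: l2arc_stop() and the feed thread
 * hand off through l2arc_thread_exit under l2arc_feed_thr_lock.  A minimal,
 * generic version of the worker side of that handshake (function name made
 * up, CPR bookkeeping omitted) would be:
 */
static void
example_l2arc_worker(void)
{
	mutex_enter(&l2arc_feed_thr_lock);
	while (l2arc_thread_exit == 0) {
		/* Sleep until there is work to do or a stop is requested. */
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
		/* ... perform one feed pass ... */
	}
	/* Acknowledge the stop request and wake the waiter in l2arc_stop(). */
	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	mutex_exit(&l2arc_feed_thr_lock);
	thread_exit();
}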
c28b2279 9250
93ce2b4c 9251#if defined(_KERNEL)
9252EXPORT_SYMBOL(arc_buf_size);
9253EXPORT_SYMBOL(arc_write);
c28b2279 9254EXPORT_SYMBOL(arc_read);
e0b0ca98 9255EXPORT_SYMBOL(arc_buf_info);
c28b2279 9256EXPORT_SYMBOL(arc_getbuf_func);
9257EXPORT_SYMBOL(arc_add_prune_callback);
9258EXPORT_SYMBOL(arc_remove_prune_callback);
c28b2279 9259
02730c33 9260/* BEGIN CSTYLED */
bce45ec9 9261module_param(zfs_arc_min, ulong, 0644);
c409e464 9262MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
c28b2279 9263
bce45ec9 9264module_param(zfs_arc_max, ulong, 0644);
c409e464 9265MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
c28b2279 9266
bce45ec9 9267module_param(zfs_arc_meta_limit, ulong, 0644);
c28b2279 9268MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
6a8f9b6b 9269
9270module_param(zfs_arc_meta_limit_percent, ulong, 0644);
9271MODULE_PARM_DESC(zfs_arc_meta_limit_percent,
9272 "Percent of arc size for arc meta limit");
9273
9274module_param(zfs_arc_meta_min, ulong, 0644);
9275MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");
9276
bce45ec9 9277module_param(zfs_arc_meta_prune, int, 0644);
2cbb06b5 9278MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
c409e464 9279
ca67b33a 9280module_param(zfs_arc_meta_adjust_restarts, int, 0644);
9281MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
9282 "Limit number of restarts in arc_adjust_meta");
9283
9284module_param(zfs_arc_meta_strategy, int, 0644);
9285MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");
9286
bce45ec9 9287module_param(zfs_arc_grow_retry, int, 0644);
9288MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
9289
9290module_param(zfs_arc_p_dampener_disable, int, 0644);
 9291MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "Disable arc_p adapt dampener");
9292
bce45ec9 9293module_param(zfs_arc_shrink_shift, int, 0644);
9294MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
9295
9296module_param(zfs_arc_pc_percent, uint, 0644);
9297MODULE_PARM_DESC(zfs_arc_pc_percent,
9298 "Percent of pagecache to reclaim arc to");
9299
9300module_param(zfs_arc_p_min_shift, int, 0644);
9301MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
9302
9303module_param(zfs_arc_average_blocksize, int, 0444);
9304MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
9305
d3c2ae1c 9306module_param(zfs_compressed_arc_enabled, int, 0644);
544596c5 9307MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers");
d3c2ae1c 9308
9309module_param(zfs_arc_min_prefetch_ms, int, 0644);
9310MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms");
9311
9312module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644);
9313MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms,
9314 "Min life of prescient prefetched block in ms");
9315
9316module_param(l2arc_write_max, ulong, 0644);
9317MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
9318
bce45ec9 9319module_param(l2arc_write_boost, ulong, 0644);
9320MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
9321
bce45ec9 9322module_param(l2arc_headroom, ulong, 0644);
9323MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
9324
9325module_param(l2arc_headroom_boost, ulong, 0644);
9326MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
9327
bce45ec9 9328module_param(l2arc_feed_secs, ulong, 0644);
9329MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
9330
bce45ec9 9331module_param(l2arc_feed_min_ms, ulong, 0644);
9332MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
9333
bce45ec9 9334module_param(l2arc_noprefetch, int, 0644);
9335MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
9336
bce45ec9 9337module_param(l2arc_feed_again, int, 0644);
9338MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
9339
bce45ec9 9340module_param(l2arc_norw, int, 0644);
9341MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
9342
9343module_param(zfs_arc_lotsfree_percent, int, 0644);
9344MODULE_PARM_DESC(zfs_arc_lotsfree_percent,
 9345	"System free memory I/O throttle in percent");
9346
9347module_param(zfs_arc_sys_free, ulong, 0644);
9348MODULE_PARM_DESC(zfs_arc_sys_free, "System free memory target size in bytes");
9349
9350module_param(zfs_arc_dnode_limit, ulong, 0644);
9351MODULE_PARM_DESC(zfs_arc_dnode_limit, "Minimum bytes of dnodes in arc");
9352
9353module_param(zfs_arc_dnode_limit_percent, ulong, 0644);
9354MODULE_PARM_DESC(zfs_arc_dnode_limit_percent,
9355 "Percent of ARC meta buffers for dnodes");
9356
9357module_param(zfs_arc_dnode_reduce_percent, ulong, 0644);
9358MODULE_PARM_DESC(zfs_arc_dnode_reduce_percent,
9359 "Percentage of excess dnodes to try to unpin");
02730c33 9360/* END CSTYLED */
c28b2279 9361#endif
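/*
 * Illustrative sketch, not part of arc.c: with 0644 permissions, each
 * module_param above is exported read-write as
 * /sys/module/zfs/parameters/<name>, and the same names can be set at
 * module load time from /etc/modprobe.d (e.g. "options zfs
 * l2arc_write_max=...").  A minimal user-space example that raises
 * l2arc_write_max (the 64 MiB value is arbitrary, for illustration only):
 */
#include <stdio.h>

int
main(void)
{
	FILE *fp = fopen("/sys/module/zfs/parameters/l2arc_write_max", "w");

	if (fp == NULL) {
		perror("l2arc_write_max");
		return (1);
	}
	/* Bytes written to L2ARC per feed interval; tune to the cache device. */
	fprintf(fp, "%llu\n", 64ULL * 1024 * 1024);
	return (fclose(fp) == 0 ? 0 : 1);
}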