]> git.proxmox.com Git - mirror_zfs.git/blame - include/sys/arc_impl.h
Fix/improve dbuf hits accounting
[mirror_zfs.git] / include / sys / arc_impl.h
CommitLineData
59ec819a
NB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
77f6826b
GA
23 * Copyright (c) 2013, Delphix. All rights reserved.
24 * Copyright (c) 2013, Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2020, George Amanakis. All rights reserved.
59ec819a
NB
27 */
28
29#ifndef _SYS_ARC_IMPL_H
30#define _SYS_ARC_IMPL_H
31
32#include <sys/arc.h>
b5256303 33#include <sys/zio_crypt.h>
c9c9c1e2
MM
34#include <sys/zthr.h>
35#include <sys/aggsum.h>
c4c162c1 36#include <sys/wmsum.h>
59ec819a
NB
37
38#ifdef __cplusplus
39extern "C" {
40#endif
41
42/*
43 * Note that buffers can be in one of 6 states:
44 * ARC_anon - anonymous (discussed below)
45 * ARC_mru - recently used, currently cached
cf7c5a03 46 * ARC_mru_ghost - recently used, no longer in cache
59ec819a
NB
47 * ARC_mfu - frequently used, currently cached
48 * ARC_mfu_ghost - frequently used, no longer in cache
49 * ARC_l2c_only - exists in L2ARC but not other states
50 * When there are no active references to the buffer, they
51 * are linked onto a list in one of these arc states. These are
52 * the only buffers that can be evicted or deleted. Within each
53 * state there are multiple lists, one for meta-data and one for
54 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
55 * etc.) is tracked separately so that it can be managed more
56 * explicitly: favored over data, limited explicitly.
57 *
58 * Anonymous buffers are buffers that are not associated with
59 * a DVA. These are buffers that hold dirty block copies
60 * before they are written to stable storage. By definition,
61 * they are "ref'd" and are considered part of arc_mru
4e33ba4c 62 * that cannot be freed. Generally, they will acquire a DVA
59ec819a
NB
63 * as they are written and migrate onto the arc_mru list.
64 *
65 * The ARC_l2c_only state is for buffers that are in the second
66 * level ARC but no longer in any of the ARC_m* lists. The second
67 * level ARC itself may also contain buffers that are in any of
68 * the ARC_m* states - meaning that a buffer can exist in two
69 * places. The reason for the ARC_l2c_only state is to keep the
70 * buffer header in the hash table, so that reads that hit the
71 * second level ARC benefit from these fast lookups.
72 */
73
74typedef struct arc_state {
ca0bf58d
PS
75 /*
76 * list of evictable buffers
77 */
ffdf019c
AM
78 multilist_t arcs_list[ARC_BUFC_NUMTYPES];
79 /*
80 * supports the "dbufs" kstat
81 */
82 arc_state_type_t arcs_state;
ca0bf58d
PS
83 /*
84 * total amount of evictable data in this state
85 */
ffdf019c 86 zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned;
ca0bf58d
PS
87 /*
88 * total amount of data in this state; this includes: evictable,
89 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
90 */
c13060e4 91 zfs_refcount_t arcs_size;
59ec819a
NB
92} arc_state_t;
93
59ec819a
NB
94typedef struct arc_callback arc_callback_t;
95
96struct arc_callback {
97 void *acb_private;
b5256303 98 arc_read_done_func_t *acb_done;
59ec819a 99 arc_buf_t *acb_buf;
b5256303 100 boolean_t acb_encrypted;
2aa34383 101 boolean_t acb_compressed;
b5256303 102 boolean_t acb_noauth;
923d7303 103 boolean_t acb_nobuf;
be9a5c35 104 zbookmark_phys_t acb_zb;
59ec819a 105 zio_t *acb_zio_dummy;
a8b2e306 106 zio_t *acb_zio_head;
59ec819a
NB
107 arc_callback_t *acb_next;
108};
109
110typedef struct arc_write_callback arc_write_callback_t;
111
112struct arc_write_callback {
b5256303
TC
113 void *awcb_private;
114 arc_write_done_func_t *awcb_ready;
115 arc_write_done_func_t *awcb_children_ready;
116 arc_write_done_func_t *awcb_physdone;
117 arc_write_done_func_t *awcb_done;
118 arc_buf_t *awcb_buf;
59ec819a
NB
119};
120
b9541d6b
CW
121/*
122 * ARC buffers are separated into multiple structs as a memory saving measure:
123 * - Common fields struct, always defined, and embedded within it:
124 * - L2-only fields, always allocated but undefined when not in L2ARC
125 * - L1-only fields, only allocated when in L1ARC
126 *
127 * Buffer in L1 Buffer only in L2
128 * +------------------------+ +------------------------+
129 * | arc_buf_hdr_t | | arc_buf_hdr_t |
130 * | | | |
131 * | | | |
132 * | | | |
133 * +------------------------+ +------------------------+
134 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
135 * | (undefined if L1-only) | | |
136 * +------------------------+ +------------------------+
137 * | l1arc_buf_hdr_t |
138 * | |
139 * | |
140 * | |
141 * | |
142 * +------------------------+
143 *
144 * Because it's possible for the L2ARC to become extremely large, we can wind
145 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
146 * is minimized by only allocating the fields necessary for an L1-cached buffer
147 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
148 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
149 * words in pointers. arc_hdr_realloc() is used to switch a header between
150 * these two allocation states.
151 */
152typedef struct l1arc_buf_hdr {
59ec819a 153 kmutex_t b_freeze_lock;
d3c2ae1c 154 zio_cksum_t *b_freeze_cksum;
59ec819a 155
cfe8e960 156 /* for waiting on reads to complete */
59ec819a 157 kcondvar_t b_cv;
d3c2ae1c 158 uint8_t b_byteswap;
59ec819a 159
59ec819a
NB
160 /* protected by arc state mutex */
161 arc_state_t *b_state;
ca0bf58d 162 multilist_node_t b_arc_node;
59ec819a 163
cfe8e960 164 /* protected by hash lock */
59ec819a
NB
165 clock_t b_arc_access;
166 uint32_t b_mru_hits;
167 uint32_t b_mru_ghost_hits;
168 uint32_t b_mfu_hits;
169 uint32_t b_mfu_ghost_hits;
cfe8e960
AM
170 uint32_t b_bufcnt;
171 arc_buf_t *b_buf;
59ec819a
NB
172
173 /* self protecting */
c13060e4 174 zfs_refcount_t b_refcnt;
59ec819a 175
b9541d6b 176 arc_callback_t *b_acb;
a6255b7f 177 abd_t *b_pabd;
b9541d6b 178} l1arc_buf_hdr_t;
59ec819a 179
77f6826b
GA
180typedef enum l2arc_dev_hdr_flags_t {
181 L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
182} l2arc_dev_hdr_flags_t;
183
184/*
185 * Pointer used in persistent L2ARC (for pointing to log blocks).
186 */
187typedef struct l2arc_log_blkptr {
188 /*
189 * Offset of log block within the device, in bytes
190 */
191 uint64_t lbp_daddr;
192 /*
193 * Aligned payload size (in bytes) of the log block
194 */
195 uint64_t lbp_payload_asize;
196 /*
197 * Offset in bytes of the first buffer in the payload
198 */
199 uint64_t lbp_payload_start;
200 /*
201 * lbp_prop has the following format:
202 * * logical size (in bytes)
657fd33b 203 * * aligned (after compression) size (in bytes)
77f6826b
GA
204 * * compression algorithm (we always LZ4-compress l2arc logs)
205 * * checksum algorithm (used for lbp_cksum)
206 */
207 uint64_t lbp_prop;
208 zio_cksum_t lbp_cksum; /* checksum of log */
209} l2arc_log_blkptr_t;
210
211/*
212 * The persistent L2ARC device header.
213 * Byte order of magic determines whether 64-bit bswap of fields is necessary.
214 */
215typedef struct l2arc_dev_hdr_phys {
216 uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
217 uint64_t dh_version; /* Persistent L2ARC version */
218
219 /*
220 * Global L2ARC device state and metadata.
221 */
222 uint64_t dh_spa_guid;
223 uint64_t dh_vdev_guid;
657fd33b 224 uint64_t dh_log_entries; /* mirror of l2ad_log_entries */
77f6826b
GA
225 uint64_t dh_evict; /* evicted offset in bytes */
226 uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
227 /*
228 * Used in zdb.c for determining if a log block is valid, in the same
229 * way that l2arc_rebuild() does.
230 */
657fd33b
GA
231 uint64_t dh_start; /* mirror of l2ad_start */
232 uint64_t dh_end; /* mirror of l2ad_end */
77f6826b
GA
233 /*
234 * Start of log block chain. [0] -> newest log, [1] -> one older (used
235 * for initiating prefetch).
236 */
237 l2arc_log_blkptr_t dh_start_lbps[2];
657fd33b
GA
238 /*
239 * Aligned size of all log blocks as accounted by vdev_space_update().
240 */
241 uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */
242 uint64_t dh_lb_count; /* mirror of l2ad_lb_count */
b7654bd7
GA
243 /*
244 * Mirrors of vdev_trim_action_time and vdev_trim_state, used to
245 * display when the cache device was fully trimmed for the last
246 * time.
247 */
248 uint64_t dh_trim_action_time;
249 uint64_t dh_trim_state;
250 const uint64_t dh_pad[30]; /* pad to 512 bytes */
77f6826b
GA
251 zio_eck_t dh_tail;
252} l2arc_dev_hdr_phys_t;
253CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
254
255/*
256 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
257 */
258typedef struct l2arc_log_ent_phys {
259 dva_t le_dva; /* dva of buffer */
260 uint64_t le_birth; /* birth txg of buffer */
261 /*
262 * le_prop has the following format:
263 * * logical size (in bytes)
264 * * physical (compressed) size (in bytes)
265 * * compression algorithm
266 * * object type (used to restore arc_buf_contents_t)
267 * * protected status (used for encryption)
268 * * prefetch status (used in l2arc_read_done())
269 */
270 uint64_t le_prop;
271 uint64_t le_daddr; /* buf location on l2dev */
10b3c7f5 272 uint64_t le_complevel;
77f6826b
GA
273 /*
274 * We pad the size of each entry to a power of 2 so that the size of
275 * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
276 * because of the L2BLK_SET_*SIZE macros.
277 */
10b3c7f5 278 const uint64_t le_pad[2]; /* pad to 64 bytes */
77f6826b
GA
279} l2arc_log_ent_phys_t;
280
281#define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
282
283/*
284 * A log block of up to 1022 ARC buffer log entries, chained into the
285 * persistent L2ARC metadata linked list. Byte order of magic determines
286 * whether 64-bit bswap of fields is necessary.
287 */
288typedef struct l2arc_log_blk_phys {
289 uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
290 /*
291 * There are 2 chains (headed by dh_start_lbps[2]), and this field
292 * points back to the previous block in this chain. We alternate
293 * which chain we append to, so they are time-wise and offset-wise
294 * interleaved, but that is an optimization rather than for
295 * correctness.
296 */
297 l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
298 /*
299 * Pad header section to 128 bytes
300 */
301 uint64_t lb_pad[7];
302 /* Payload */
303 l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
304} l2arc_log_blk_phys_t; /* 64K total */
305
306/*
307 * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
308 * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
309 */
310CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
311 1ULL << SPA_MINBLOCKSHIFT));
312CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
313CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
314
315/*
316 * These structures hold in-flight abd buffers for log blocks as they're being
317 * written to the L2ARC device.
318 */
319typedef struct l2arc_lb_abd_buf {
320 abd_t *abd;
321 list_node_t node;
322} l2arc_lb_abd_buf_t;
323
324/*
325 * These structures hold pointers to log blocks present on the L2ARC device.
326 */
327typedef struct l2arc_lb_ptr_buf {
328 l2arc_log_blkptr_t *lb_ptr;
329 list_node_t node;
330} l2arc_lb_ptr_buf_t;
331
332/* Macros for getting and setting fields in le_prop and lbp_prop */
333#define L2BLK_GET_LSIZE(field) \
334 BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
335#define L2BLK_SET_LSIZE(field, x) \
336 BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
337#define L2BLK_GET_PSIZE(field) \
338 BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
339#define L2BLK_SET_PSIZE(field, x) \
340 BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
341#define L2BLK_GET_COMPRESS(field) \
342 BF64_GET((field), 32, SPA_COMPRESSBITS)
343#define L2BLK_SET_COMPRESS(field, x) \
344 BF64_SET((field), 32, SPA_COMPRESSBITS, x)
345#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1)
346#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x)
347#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8)
348#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x)
349#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8)
350#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x)
351#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1)
352#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x)
08532162
GA
353#define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4)
354#define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x)
77f6826b
GA
355
/*
 * Exchange the values of two pointer lvalues.
 *
 * x, y - modifiable pointer lvalues; each is evaluated twice, so the
 *        arguments must not have side effects.
 *
 * The value is staged through a void *, so both arguments must be object
 * pointers that convert implicitly to and from void *. The arguments are
 * parenthesized at every use, and the temporary has a macro-specific name
 * so that an argument that is itself called "tmp" is still swapped
 * correctly instead of being shadowed inside the expansion.
 */
#define	PTR_SWAP(x, y)					\
	do {						\
		void *ptr_swap_tmp_ = (x);		\
		(x) = (y);				\
		(y) = ptr_swap_tmp_;			\
	} while (0)
362
363#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */
364#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */
365
366/*
367 * L2ARC Internals
368 */
369typedef struct l2arc_dev {
370 vdev_t *l2ad_vdev; /* vdev */
371 spa_t *l2ad_spa; /* spa */
372 uint64_t l2ad_hand; /* next write location */
373 uint64_t l2ad_start; /* first addr on device */
374 uint64_t l2ad_end; /* last addr on device */
375 boolean_t l2ad_first; /* first sweep through */
376 boolean_t l2ad_writing; /* currently writing */
377 kmutex_t l2ad_mtx; /* lock for buffer list */
378 list_t l2ad_buflist; /* buffer list */
379 list_node_t l2ad_node; /* device list node */
380 zfs_refcount_t l2ad_alloc; /* allocated bytes */
381 /*
382 * Persistence-related stuff
383 */
384 l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
385 uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
386 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
387 int l2ad_log_ent_idx; /* index into cur log blk */
388 /* Number of bytes in current log block's payload */
389 uint64_t l2ad_log_blk_payload_asize;
390 /*
391 * Offset (in bytes) of the first buffer in current log block's
392 * payload.
393 */
394 uint64_t l2ad_log_blk_payload_start;
395 /* Flag indicating whether a rebuild is scheduled or is going on */
396 boolean_t l2ad_rebuild;
397 boolean_t l2ad_rebuild_cancel;
398 boolean_t l2ad_rebuild_began;
399 uint64_t l2ad_log_entries; /* entries per log blk */
400 uint64_t l2ad_evict; /* evicted offset in bytes */
401 /* List of pointers to log blocks present in the L2ARC device */
402 list_t l2ad_lbptr_list;
657fd33b
GA
403 /*
404 * Aligned size of all log blocks as accounted by vdev_space_update().
405 */
406 zfs_refcount_t l2ad_lb_asize;
407 /*
408 * Number of log blocks present on the device.
409 */
410 zfs_refcount_t l2ad_lb_count;
b7654bd7 411 boolean_t l2ad_trim_all; /* TRIM whole device */
77f6826b
GA
412} l2arc_dev_t;
413
b5256303
TC
414/*
415 * Encrypted blocks will need to be stored encrypted on the L2ARC
416 * disk as they appear in the main pool. In order for this to work we
417 * need to pass around the encryption parameters so they can be used
418 * to write data to the L2ARC. This struct is only defined in the
419 * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
420 * flag set.
421 */
422typedef struct arc_buf_hdr_crypt {
423 abd_t *b_rabd; /* raw encrypted data */
424 dmu_object_type_t b_ot; /* object type */
425 uint32_t b_ebufcnt; /* count of encrypted buffers */
426
427 /* dsobj for looking up encryption key for l2arc encryption */
428 uint64_t b_dsobj;
429
430 /* encryption parameters */
431 uint8_t b_salt[ZIO_DATA_SALT_LEN];
432 uint8_t b_iv[ZIO_DATA_IV_LEN];
433
434 /*
435 * Technically this could be removed since we will always be able to
436 * get the mac from the bp when we need it. However, it is inconvenient
437 * for callers of arc code to have to pass a bp in all the time. This
438 * also allows us to assert that L2ARC data is properly encrypted to
439 * match the data in the main storage pool.
440 */
441 uint8_t b_mac[ZIO_DATA_MAC_LEN];
442} arc_buf_hdr_crypt_t;
443
b9541d6b
CW
444typedef struct l2arc_buf_hdr {
445 /* protected by arc_buf_hdr mutex */
446 l2arc_dev_t *b_dev; /* L2ARC device */
447 uint64_t b_daddr; /* disk address, offset byte */
b9541d6b 448 uint32_t b_hits;
08532162 449 arc_state_type_t b_arcs_state;
b9541d6b
CW
450 list_node_t b_l2node;
451} l2arc_buf_hdr_t;
452
49ee64e5
NB
453typedef struct l2arc_write_callback {
454 l2arc_dev_t *l2wcb_dev; /* device info */
455 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
77f6826b
GA
456 /* in-flight list of log blocks */
457 list_t l2wcb_abd_list;
49ee64e5
NB
458} l2arc_write_callback_t;
459
b9541d6b
CW
460struct arc_buf_hdr {
461 /* protected by hash lock */
462 dva_t b_dva;
463 uint64_t b_birth;
b9541d6b 464
d3c2ae1c 465 arc_buf_contents_t b_type;
10b3c7f5
MN
466 uint8_t b_complevel;
467 uint8_t b_reserved1; /* used for 4 byte alignment */
468 uint16_t b_reserved2; /* used for 4 byte alignment */
b9541d6b
CW
469 arc_buf_hdr_t *b_hash_next;
470 arc_flags_t b_flags;
471
d3c2ae1c
GW
472 /*
473 * This field stores the size of the data buffer after
474 * compression, and is set in the arc's zio completion handlers.
475 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
476 *
477 * While the block pointers can store up to 32MB in their psize
478 * field, we can only store up to 32MB minus 512B. This is due
479 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
480 * a field of zeros represents 512B in the bp). We can't use a
481 * bias of 1 since we need to reserve a psize of zero, here, to
482 * represent holes and embedded blocks.
483 *
484 * This isn't a problem in practice, since the maximum size of a
485 * buffer is limited to 16MB, so we never need to store 32MB in
486 * this field. Even in the upstream illumos code base, the
487 * maximum size of a buffer is limited to 16MB.
488 */
489 uint16_t b_psize;
490
491 /*
492 * This field stores the size of the data buffer before
493 * compression, and cannot change once set. It is in units
494 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
495 */
496 uint16_t b_lsize; /* immutable */
497 uint64_t b_spa; /* immutable */
b9541d6b
CW
498
499 /* L2ARC fields. Undefined when not in L2ARC. */
500 l2arc_buf_hdr_t b_l2hdr;
501 /* L1ARC fields. Undefined when in l2arc_only state */
502 l1arc_buf_hdr_t b_l1hdr;
b5256303
TC
503 /*
504 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
505 * is set and the L1 header exists.
506 */
507 arc_buf_hdr_crypt_t b_crypt_hdr;
b9541d6b 508};
13a4027a
MM
509
510typedef struct arc_stats {
511 kstat_named_t arcstat_hits;
512 kstat_named_t arcstat_misses;
513 kstat_named_t arcstat_demand_data_hits;
514 kstat_named_t arcstat_demand_data_misses;
515 kstat_named_t arcstat_demand_metadata_hits;
516 kstat_named_t arcstat_demand_metadata_misses;
517 kstat_named_t arcstat_prefetch_data_hits;
518 kstat_named_t arcstat_prefetch_data_misses;
519 kstat_named_t arcstat_prefetch_metadata_hits;
520 kstat_named_t arcstat_prefetch_metadata_misses;
521 kstat_named_t arcstat_mru_hits;
522 kstat_named_t arcstat_mru_ghost_hits;
523 kstat_named_t arcstat_mfu_hits;
524 kstat_named_t arcstat_mfu_ghost_hits;
525 kstat_named_t arcstat_deleted;
526 /*
527 * Number of buffers that could not be evicted because the hash lock
528 * was held by another thread. The lock may not necessarily be held
529 * by something using the same buffer, since hash locks are shared
530 * by multiple buffers.
531 */
532 kstat_named_t arcstat_mutex_miss;
533 /*
534 * Number of buffers skipped when updating the access state due to the
535 * header having already been released after acquiring the hash lock.
536 */
537 kstat_named_t arcstat_access_skip;
538 /*
539 * Number of buffers skipped because they have I/O in progress, are
540 * indirect prefetch buffers that have not lived long enough, or are
541 * not from the spa we're trying to evict from.
542 */
543 kstat_named_t arcstat_evict_skip;
544 /*
545 * Number of times arc_evict_state() was unable to evict enough
546 * buffers to reach its target amount.
547 */
548 kstat_named_t arcstat_evict_not_enough;
549 kstat_named_t arcstat_evict_l2_cached;
550 kstat_named_t arcstat_evict_l2_eligible;
08532162
GA
551 kstat_named_t arcstat_evict_l2_eligible_mfu;
552 kstat_named_t arcstat_evict_l2_eligible_mru;
13a4027a
MM
553 kstat_named_t arcstat_evict_l2_ineligible;
554 kstat_named_t arcstat_evict_l2_skip;
555 kstat_named_t arcstat_hash_elements;
556 kstat_named_t arcstat_hash_elements_max;
557 kstat_named_t arcstat_hash_collisions;
558 kstat_named_t arcstat_hash_chains;
559 kstat_named_t arcstat_hash_chain_max;
560 kstat_named_t arcstat_p;
561 kstat_named_t arcstat_c;
562 kstat_named_t arcstat_c_min;
563 kstat_named_t arcstat_c_max;
13a4027a
MM
564 kstat_named_t arcstat_size;
565 /*
566 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
567 * Note that the compressed bytes may match the uncompressed bytes
568 * if the block is either not compressed or compressed arc is disabled.
569 */
570 kstat_named_t arcstat_compressed_size;
571 /*
572 * Uncompressed size of the data stored in b_pabd. If compressed
573 * arc is disabled then this value will be identical to the stat
574 * above.
575 */
576 kstat_named_t arcstat_uncompressed_size;
577 /*
578 * Number of bytes stored in all the arc_buf_t's. This is classified
579 * as "overhead" since this data is typically short-lived and will
580 * be evicted from the arc when it becomes unreferenced unless the
581 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
582 * values have been set (see comment in dbuf.c for more information).
583 */
584 kstat_named_t arcstat_overhead_size;
585 /*
586 * Number of bytes consumed by internal ARC structures necessary
587 * for tracking purposes; these structures are not actually
588 * backed by ARC buffers. This includes arc_buf_hdr_t structures
589 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
590 * caches), and arc_buf_t structures (allocated via arc_buf_t
591 * cache).
13a4027a
MM
592 */
593 kstat_named_t arcstat_hdr_size;
594 /*
595 * Number of bytes consumed by ARC buffers of type equal to
596 * ARC_BUFC_DATA. This is generally consumed by buffers backing
597 * on disk user data (e.g. plain file contents).
13a4027a
MM
598 */
599 kstat_named_t arcstat_data_size;
600 /*
601 * Number of bytes consumed by ARC buffers of type equal to
602 * ARC_BUFC_METADATA. This is generally consumed by buffers
603 * backing on disk data that is used for internal ZFS
604 * structures (e.g. ZAP, dnode, indirect blocks, etc).
13a4027a
MM
605 */
606 kstat_named_t arcstat_metadata_size;
607 /*
608 * Number of bytes consumed by dmu_buf_impl_t objects.
13a4027a
MM
609 */
610 kstat_named_t arcstat_dbuf_size;
611 /*
612 * Number of bytes consumed by dnode_t objects.
13a4027a
MM
613 */
614 kstat_named_t arcstat_dnode_size;
615 /*
616 * Number of bytes consumed by bonus buffers.
13a4027a
MM
617 */
618 kstat_named_t arcstat_bonus_size;
1c2725a1
MM
619#if defined(COMPAT_FREEBSD11)
620 /*
621 * Sum of the previous three counters, provided for compatibility.
622 */
623 kstat_named_t arcstat_other_size;
624#endif
625
13a4027a
MM
626 /*
627 * Total number of bytes consumed by ARC buffers residing in the
628 * arc_anon state. This includes *all* buffers in the arc_anon
629 * state; e.g. data, metadata, evictable, and unevictable buffers
630 * are all included in this value.
13a4027a
MM
631 */
632 kstat_named_t arcstat_anon_size;
633 /*
634 * Number of bytes consumed by ARC buffers that meet the
635 * following criteria: backing buffers of type ARC_BUFC_DATA,
636 * residing in the arc_anon state, and are eligible for eviction
637 * (e.g. have no outstanding holds on the buffer).
13a4027a
MM
638 */
639 kstat_named_t arcstat_anon_evictable_data;
640 /*
641 * Number of bytes consumed by ARC buffers that meet the
642 * following criteria: backing buffers of type ARC_BUFC_METADATA,
643 * residing in the arc_anon state, and are eligible for eviction
644 * (e.g. have no outstanding holds on the buffer).
13a4027a
MM
645 */
646 kstat_named_t arcstat_anon_evictable_metadata;
647 /*
648 * Total number of bytes consumed by ARC buffers residing in the
649 * arc_mru state. This includes *all* buffers in the arc_mru
650 * state; e.g. data, metadata, evictable, and unevictable buffers
651 * are all included in this value.
13a4027a
MM
652 */
653 kstat_named_t arcstat_mru_size;
654 /*
655 * Number of bytes consumed by ARC buffers that meet the
656 * following criteria: backing buffers of type ARC_BUFC_DATA,
657 * residing in the arc_mru state, and are eligible for eviction
658 * (e.g. have no outstanding holds on the buffer).
13a4027a
MM
659 */
660 kstat_named_t arcstat_mru_evictable_data;
661 /*
662 * Number of bytes consumed by ARC buffers that meet the
663 * following criteria: backing buffers of type ARC_BUFC_METADATA,
664 * residing in the arc_mru state, and are eligible for eviction
665 * (e.g. have no outstanding holds on the buffer).
13a4027a
MM
666 */
667 kstat_named_t arcstat_mru_evictable_metadata;
668 /*
669 * Total number of bytes that *would have been* consumed by ARC
670 * buffers in the arc_mru_ghost state. The key thing to note
671 * here, is the fact that this size doesn't actually indicate
672 * RAM consumption. The ghost lists only consist of headers and
673 * don't actually have ARC buffers linked off of these headers.
674 * Thus, *if* the headers had associated ARC buffers, these
675 * buffers *would have* consumed this number of bytes.
13a4027a
MM
676 */
677 kstat_named_t arcstat_mru_ghost_size;
678 /*
679 * Number of bytes that *would have been* consumed by ARC
680 * buffers that are eligible for eviction, of type
681 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
13a4027a
MM
682 */
683 kstat_named_t arcstat_mru_ghost_evictable_data;
684 /*
685 * Number of bytes that *would have been* consumed by ARC
686 * buffers that are eligible for eviction, of type
687 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
13a4027a
MM
688 */
689 kstat_named_t arcstat_mru_ghost_evictable_metadata;
690 /*
691 * Total number of bytes consumed by ARC buffers residing in the
692 * arc_mfu state. This includes *all* buffers in the arc_mfu
693 * state; e.g. data, metadata, evictable, and unevictable buffers
694 * are all included in this value.
13a4027a
MM
695 */
696 kstat_named_t arcstat_mfu_size;
697 /*
698 * Number of bytes consumed by ARC buffers that are eligible for
699 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
700 * state.
13a4027a
MM
701 */
702 kstat_named_t arcstat_mfu_evictable_data;
703 /*
704 * Number of bytes consumed by ARC buffers that are eligible for
705 * eviction, of type ARC_BUFC_METADATA, and reside in the
706 * arc_mfu state.
13a4027a
MM
707 */
708 kstat_named_t arcstat_mfu_evictable_metadata;
709 /*
710 * Total number of bytes that *would have been* consumed by ARC
711 * buffers in the arc_mfu_ghost state. See the comment above
712 * arcstat_mru_ghost_size for more details.
13a4027a
MM
713 */
714 kstat_named_t arcstat_mfu_ghost_size;
715 /*
716 * Number of bytes that *would have been* consumed by ARC
717 * buffers that are eligible for eviction, of type
718 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
13a4027a
MM
719 */
720 kstat_named_t arcstat_mfu_ghost_evictable_data;
721 /*
722 * Number of bytes that *would have been* consumed by ARC
723 * buffers that are eligible for eviction, of type
724 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
13a4027a
MM
725 */
726 kstat_named_t arcstat_mfu_ghost_evictable_metadata;
727 kstat_named_t arcstat_l2_hits;
728 kstat_named_t arcstat_l2_misses;
08532162
GA
729 /*
730 * Allocated size (in bytes) of L2ARC cached buffers by ARC state.
731 */
732 kstat_named_t arcstat_l2_prefetch_asize;
733 kstat_named_t arcstat_l2_mru_asize;
734 kstat_named_t arcstat_l2_mfu_asize;
735 /*
736 * Allocated size (in bytes) of L2ARC cached buffers by buffer content
737 * type.
738 */
739 kstat_named_t arcstat_l2_bufc_data_asize;
740 kstat_named_t arcstat_l2_bufc_metadata_asize;
13a4027a
MM
741 kstat_named_t arcstat_l2_feeds;
742 kstat_named_t arcstat_l2_rw_clash;
743 kstat_named_t arcstat_l2_read_bytes;
744 kstat_named_t arcstat_l2_write_bytes;
745 kstat_named_t arcstat_l2_writes_sent;
746 kstat_named_t arcstat_l2_writes_done;
747 kstat_named_t arcstat_l2_writes_error;
748 kstat_named_t arcstat_l2_writes_lock_retry;
749 kstat_named_t arcstat_l2_evict_lock_retry;
750 kstat_named_t arcstat_l2_evict_reading;
751 kstat_named_t arcstat_l2_evict_l1cached;
752 kstat_named_t arcstat_l2_free_on_write;
753 kstat_named_t arcstat_l2_abort_lowmem;
754 kstat_named_t arcstat_l2_cksum_bad;
755 kstat_named_t arcstat_l2_io_error;
756 kstat_named_t arcstat_l2_lsize;
757 kstat_named_t arcstat_l2_psize;
13a4027a 758 kstat_named_t arcstat_l2_hdr_size;
77f6826b
GA
759 /*
760 * Number of L2ARC log blocks written. These are used for restoring the
761 * L2ARC. Updated during writing of L2ARC log blocks.
762 */
763 kstat_named_t arcstat_l2_log_blk_writes;
764 /*
657fd33b 765 * Moving average of the aligned size of the L2ARC log blocks, in
77f6826b
GA
766 * bytes. Updated during L2ARC rebuild and during writing of L2ARC
767 * log blocks.
768 */
657fd33b
GA
769 kstat_named_t arcstat_l2_log_blk_avg_asize;
770 /* Aligned size of L2ARC log blocks on L2ARC devices. */
771 kstat_named_t arcstat_l2_log_blk_asize;
772 /* Number of L2ARC log blocks present on L2ARC devices. */
773 kstat_named_t arcstat_l2_log_blk_count;
77f6826b 774 /*
657fd33b
GA
775 * Moving average of the ratio of the aligned size of L2ARC restored
776 * data, in bytes, to the aligned size of its metadata in L2ARC, in bytes.
77f6826b
GA
777 * Updated during L2ARC rebuild and during writing of L2ARC log blocks.
778 */
779 kstat_named_t arcstat_l2_data_to_meta_ratio;
780 /*
781 * Number of times the L2ARC rebuild was successful for an L2ARC device.
782 */
783 kstat_named_t arcstat_l2_rebuild_success;
784 /*
785 * Number of times the L2ARC rebuild failed because the device header
786 * was in an unsupported format or corrupted.
787 */
788 kstat_named_t arcstat_l2_rebuild_abort_unsupported;
789 /*
790 * Number of times the L2ARC rebuild failed because of IO errors
791 * while reading a log block.
792 */
793 kstat_named_t arcstat_l2_rebuild_abort_io_errors;
794 /*
795 * Number of times the L2ARC rebuild failed because of IO errors when
796 * reading the device header.
797 */
798 kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
799 /*
800 * Number of L2ARC log blocks which failed to be restored due to
801 * checksum errors.
802 */
803 kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
804 /*
805 * Number of times the L2ARC rebuild was aborted due to low system
806 * memory.
807 */
808 kstat_named_t arcstat_l2_rebuild_abort_lowmem;
809 /* Logical size of L2ARC restored data, in bytes. */
810 kstat_named_t arcstat_l2_rebuild_size;
657fd33b
GA
811 /* Aligned size of L2ARC restored data, in bytes. */
812 kstat_named_t arcstat_l2_rebuild_asize;
77f6826b
GA
813 /*
814 * Number of L2ARC log entries (buffers) that were successfully
815 * restored in ARC.
816 */
817 kstat_named_t arcstat_l2_rebuild_bufs;
818 /*
819 * Number of L2ARC log entries (buffers) already cached in ARC. These
820 * were not restored again.
821 */
822 kstat_named_t arcstat_l2_rebuild_bufs_precached;
77f6826b
GA
823 /*
824 * Number of L2ARC log blocks that were restored successfully. Each
825 * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
826 */
827 kstat_named_t arcstat_l2_rebuild_log_blks;
13a4027a
MM
828 kstat_named_t arcstat_memory_throttle_count;
829 kstat_named_t arcstat_memory_direct_count;
830 kstat_named_t arcstat_memory_indirect_count;
831 kstat_named_t arcstat_memory_all_bytes;
832 kstat_named_t arcstat_memory_free_bytes;
833 kstat_named_t arcstat_memory_available_bytes;
834 kstat_named_t arcstat_no_grow;
835 kstat_named_t arcstat_tempreserve;
836 kstat_named_t arcstat_loaned_bytes;
837 kstat_named_t arcstat_prune;
13a4027a
MM
838 kstat_named_t arcstat_meta_used;
839 kstat_named_t arcstat_meta_limit;
840 kstat_named_t arcstat_dnode_limit;
841 kstat_named_t arcstat_meta_max;
842 kstat_named_t arcstat_meta_min;
843 kstat_named_t arcstat_async_upgrade_sync;
844 kstat_named_t arcstat_demand_hit_predictive_prefetch;
845 kstat_named_t arcstat_demand_hit_prescient_prefetch;
846 kstat_named_t arcstat_need_free;
847 kstat_named_t arcstat_sys_free;
848 kstat_named_t arcstat_raw_size;
1dc32a67 849 kstat_named_t arcstat_cached_only_in_progress;
85ec5cba 850 kstat_named_t arcstat_abd_chunk_waste_size;
13a4027a
MM
851} arc_stats_t;
852
c4c162c1
AM
853typedef struct arc_sums {
854 wmsum_t arcstat_hits;
855 wmsum_t arcstat_misses;
856 wmsum_t arcstat_demand_data_hits;
857 wmsum_t arcstat_demand_data_misses;
858 wmsum_t arcstat_demand_metadata_hits;
859 wmsum_t arcstat_demand_metadata_misses;
860 wmsum_t arcstat_prefetch_data_hits;
861 wmsum_t arcstat_prefetch_data_misses;
862 wmsum_t arcstat_prefetch_metadata_hits;
863 wmsum_t arcstat_prefetch_metadata_misses;
864 wmsum_t arcstat_mru_hits;
865 wmsum_t arcstat_mru_ghost_hits;
866 wmsum_t arcstat_mfu_hits;
867 wmsum_t arcstat_mfu_ghost_hits;
868 wmsum_t arcstat_deleted;
869 wmsum_t arcstat_mutex_miss;
870 wmsum_t arcstat_access_skip;
871 wmsum_t arcstat_evict_skip;
872 wmsum_t arcstat_evict_not_enough;
873 wmsum_t arcstat_evict_l2_cached;
874 wmsum_t arcstat_evict_l2_eligible;
875 wmsum_t arcstat_evict_l2_eligible_mfu;
876 wmsum_t arcstat_evict_l2_eligible_mru;
877 wmsum_t arcstat_evict_l2_ineligible;
878 wmsum_t arcstat_evict_l2_skip;
879 wmsum_t arcstat_hash_collisions;
880 wmsum_t arcstat_hash_chains;
881 aggsum_t arcstat_size;
882 wmsum_t arcstat_compressed_size;
883 wmsum_t arcstat_uncompressed_size;
884 wmsum_t arcstat_overhead_size;
885 wmsum_t arcstat_hdr_size;
886 wmsum_t arcstat_data_size;
887 wmsum_t arcstat_metadata_size;
888 wmsum_t arcstat_dbuf_size;
889 aggsum_t arcstat_dnode_size;
890 wmsum_t arcstat_bonus_size;
891 wmsum_t arcstat_l2_hits;
892 wmsum_t arcstat_l2_misses;
893 wmsum_t arcstat_l2_prefetch_asize;
894 wmsum_t arcstat_l2_mru_asize;
895 wmsum_t arcstat_l2_mfu_asize;
896 wmsum_t arcstat_l2_bufc_data_asize;
897 wmsum_t arcstat_l2_bufc_metadata_asize;
898 wmsum_t arcstat_l2_feeds;
899 wmsum_t arcstat_l2_rw_clash;
900 wmsum_t arcstat_l2_read_bytes;
901 wmsum_t arcstat_l2_write_bytes;
902 wmsum_t arcstat_l2_writes_sent;
903 wmsum_t arcstat_l2_writes_done;
904 wmsum_t arcstat_l2_writes_error;
905 wmsum_t arcstat_l2_writes_lock_retry;
906 wmsum_t arcstat_l2_evict_lock_retry;
907 wmsum_t arcstat_l2_evict_reading;
908 wmsum_t arcstat_l2_evict_l1cached;
909 wmsum_t arcstat_l2_free_on_write;
910 wmsum_t arcstat_l2_abort_lowmem;
911 wmsum_t arcstat_l2_cksum_bad;
912 wmsum_t arcstat_l2_io_error;
913 wmsum_t arcstat_l2_lsize;
914 wmsum_t arcstat_l2_psize;
915 aggsum_t arcstat_l2_hdr_size;
916 wmsum_t arcstat_l2_log_blk_writes;
917 wmsum_t arcstat_l2_log_blk_asize;
918 wmsum_t arcstat_l2_log_blk_count;
919 wmsum_t arcstat_l2_rebuild_success;
920 wmsum_t arcstat_l2_rebuild_abort_unsupported;
921 wmsum_t arcstat_l2_rebuild_abort_io_errors;
922 wmsum_t arcstat_l2_rebuild_abort_dh_errors;
923 wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors;
924 wmsum_t arcstat_l2_rebuild_abort_lowmem;
925 wmsum_t arcstat_l2_rebuild_size;
926 wmsum_t arcstat_l2_rebuild_asize;
927 wmsum_t arcstat_l2_rebuild_bufs;
928 wmsum_t arcstat_l2_rebuild_bufs_precached;
929 wmsum_t arcstat_l2_rebuild_log_blks;
930 wmsum_t arcstat_memory_throttle_count;
931 wmsum_t arcstat_memory_direct_count;
932 wmsum_t arcstat_memory_indirect_count;
933 wmsum_t arcstat_prune;
934 aggsum_t arcstat_meta_used;
935 wmsum_t arcstat_async_upgrade_sync;
936 wmsum_t arcstat_demand_hit_predictive_prefetch;
937 wmsum_t arcstat_demand_hit_prescient_prefetch;
938 wmsum_t arcstat_raw_size;
939 wmsum_t arcstat_cached_only_in_progress;
940 wmsum_t arcstat_abd_chunk_waste_size;
941} arc_sums_t;
942
3442c2a0
MA
943typedef struct arc_evict_waiter {
944 list_node_t aew_node;
945 kcondvar_t aew_cv;
946 uint64_t aew_count;
947} arc_evict_waiter_t;
c9c9c1e2
MM
948
/*
 * ARCSTAT(stat) expands to the ui64 value of the named entry in arc_stats.
 * The expansion is an lvalue, so it may be read or assigned.
 */
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/*
 * ARCSTAT_INCR(stat, val) adds val to the counter of the same name in
 * arc_sums by calling wmsum_add().  Note that it does not touch arc_stats:
 * ARCSTAT() and ARCSTAT_INCR() operate on two different objects.
 */
#define	ARCSTAT_INCR(stat, val)	wmsum_add(&arc_sums.stat, (val))

/* Convenience forms that add +1 or -1 through ARCSTAT_INCR(). */
#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
956
/*
 * Short names for values kept directly in arc_stats.  Each one expands
 * through ARCSTAT(), so it is an lvalue on the kstat's ui64 field.
 */
/* when set, the cache size must not grow */
#define	arc_no_grow	ARCSTAT(arcstat_no_grow)
/* target size of the MRU */
#define	arc_p		ARCSTAT(arcstat_p)
/* target size of the cache */
#define	arc_c		ARCSTAT(arcstat_c)
/* minimum target cache size */
#define	arc_c_min	ARCSTAT(arcstat_c_min)
/* maximum target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)
/* target number of system free bytes */
#define	arc_sys_free	ARCSTAT(arcstat_sys_free)
c9c9c1e2 963
8172df64
AM
/*
 * Pointer-valued aliases: each lower-case name is the address of the
 * corresponding ARC_* state object.
 *
 * NOTE(review): only ARC_mru and ARC_mfu have extern declarations in the
 * nearby part of this header; the remaining ARC_* objects must be declared
 * wherever these aliases are actually used -- confirm.
 */
#define	arc_anon	(&ARC_anon)
#define	arc_mru		(&ARC_mru)
#define	arc_mru_ghost	(&ARC_mru_ghost)
#define	arc_mfu		(&ARC_mfu)
#define	arc_mfu_ghost	(&ARC_mfu_ghost)
#define	arc_l2c_only	(&ARC_l2c_only)
970
c9c9c1e2
MM
971extern taskq_t *arc_prune_taskq;
972extern arc_stats_t arc_stats;
c4c162c1 973extern arc_sums_t arc_sums;
c9c9c1e2
MM
974extern hrtime_t arc_growtime;
975extern boolean_t arc_warm;
976extern int arc_grow_retry;
f7a68f99 977extern int arc_no_grow_shift;
c9c9c1e2 978extern int arc_shrink_shift;
c9c9c1e2
MM
979extern kmutex_t arc_prune_mtx;
980extern list_t arc_prune_list;
8172df64
AM
981extern arc_state_t ARC_mfu;
982extern arc_state_t ARC_mru;
c9c9c1e2
MM
983extern uint_t zfs_arc_pc_percent;
984extern int arc_lotsfree_percent;
0421f257
RM
985extern unsigned long zfs_arc_min;
986extern unsigned long zfs_arc_max;
c9c9c1e2
MM
987
988extern void arc_reduce_target_size(int64_t to_free);
989extern boolean_t arc_reclaim_needed(void);
990extern void arc_kmem_reap_soon(void);
3442c2a0 991extern void arc_wait_for_eviction(uint64_t);
c9c9c1e2
MM
992
993extern void arc_lowmem_init(void);
994extern void arc_lowmem_fini(void);
995extern void arc_prune_async(int64_t);
996extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg);
997extern uint64_t arc_free_memory(void);
998extern int64_t arc_available_memory(void);
36a6e233 999extern void arc_tuning_update(boolean_t);
60a4c7d2
PD
1000extern void arc_register_hotplug(void);
1001extern void arc_unregister_hotplug(void);
e3570464 1002
7e3df9db
RM
/*
 * Setter callbacks whose parameter list is supplied by the
 * ZFS_MODULE_PARAM_ARGS macro (defined outside this header).
 */
int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);
c9c9c1e2 1007
77f6826b
GA
1008/* used in zdb.c */
1009boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
1010 const l2arc_log_blkptr_t *lbp);
1011
b7654bd7
GA
1012/* used in vdev_trim.c */
1013void l2arc_dev_hdr_update(l2arc_dev_t *dev);
1014l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
1015
59ec819a
NB
1016#ifdef __cplusplus
1017}
1018#endif
1019
1020#endif /* _SYS_ARC_IMPL_H */