mirror_zfs.git - module/zfs/arc.c
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2011 by Delphix. All rights reserved.
25 */
26
27/*
28 * DVA-based Adjustable Replacement Cache
29 *
30 * While much of the theory of operation used here is
31 * based on the self-tuning, low overhead replacement cache
32 * presented by Megiddo and Modha at FAST 2003, there are some
33 * significant differences:
34 *
35 * 1. The Megiddo and Modha model assumes any page is evictable.
36 * Pages in its cache cannot be "locked" into memory. This makes
37 * the eviction algorithm simple: evict the last page in the list.
38 * This also makes the performance characteristics easy to reason
39 * about. Our cache is not so simple. At any given moment, some
40 * subset of the blocks in the cache are un-evictable because we
41 * have handed out a reference to them. Blocks are only evictable
42 * when there are no external references active. This makes
43 * eviction far more problematic: we choose to evict the evictable
44 * blocks that are the "lowest" in the list.
45 *
46 * There are times when it is not possible to evict the requested
47 * space. In these circumstances we are unable to adjust the cache
48 * size. To prevent the cache growing unbounded at these times we
49 * implement a "cache throttle" that slows the flow of new data
50 * into the cache until we can make space available.
51 *
52 * 2. The Megiddo and Modha model assumes a fixed cache size.
53 * Pages are evicted when the cache is full and there is a cache
54 * miss. Our model has a variable sized cache. It grows with
55 * high use, but also tries to react to memory pressure from the
56 * operating system: decreasing its size when system memory is
57 * tight.
58 *
59 * 3. The Megiddo and Modha model assumes a fixed page size. All
60 * elements of the cache are therefore exactly the same size. So
61 * when adjusting the cache size following a cache miss, it is simply
62 * a matter of choosing a single page to evict. In our model, we
63 * have variable sized cache blocks (ranging from 512 bytes to
64 * 128K bytes). We therefore choose a set of blocks to evict to make
65 * space for a cache miss that approximates as closely as possible
66 * the space used by the new block.
67 *
68 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
69 * by N. Megiddo & D. Modha, FAST 2003
70 */
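/*
 * Illustrative sketch (not part of the original source): evicting for a
 * variable-sized miss therefore means walking an eviction list and
 * accumulating evictable blocks until the requested space is covered,
 * roughly:
 *
 *	uint64_t freed = 0;
 *	while (freed < bytes_needed && (ab = next_evictable(list)) != NULL)
 *		freed += evict_block(ab);	(hypothetical helpers)
 *
 * arc_evict() later in this file implements the real policy, including
 * skipping referenced buffers and honoring the prefetch minimum lifespan.
 */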
71
72/*
73 * The locking model:
74 *
75 * A new reference to a cache buffer can be obtained in two
76 * ways: 1) via a hash table lookup using the DVA as a key,
77 * or 2) via one of the ARC lists. The arc_read() interface
78 * uses method 1, while the internal arc algorithms for
79 * adjusting the cache use method 2. We therefore provide two
80 * types of locks: 1) the hash table lock array, and 2) the
81 * arc list locks.
82 *
83 * Buffers do not have their own mutexes, rather they rely on the
84 * hash table mutexes for the bulk of their protection (i.e. most
85 * fields in the arc_buf_hdr_t are protected by these mutexes).
86 *
87 * buf_hash_find() returns the appropriate mutex (held) when it
88 * locates the requested buffer in the hash table. It returns
89 * NULL for the mutex if the buffer was not in the table.
90 *
91 * buf_hash_remove() expects the appropriate hash mutex to be
92 * already held before it is invoked.
93 *
94 * Each arc state also has a mutex which is used to protect the
95 * buffer list associated with the state. When attempting to
96 * obtain a hash table lock while holding an arc list lock you
97 * must use mutex_tryenter() to avoid deadlock. Also note that
98 * the active state mutex must be held before the ghost state mutex.
99 *
100 * Arc buffers may have an associated eviction callback function.
101 * This function will be invoked prior to removing the buffer (e.g.
102 * in arc_do_user_evicts()). Note however that the data associated
103 * with the buffer may be evicted prior to the callback. The callback
104 * must be made with *no locks held* (to prevent deadlock). Additionally,
105 * the users of callbacks must ensure that their private data is
106 * protected from simultaneous callbacks from arc_buf_evict()
107 * and arc_do_user_evicts().
108 *
109 * It is also possible to register a callback which is run when the
110 * arc_meta_limit is reached and no buffers can be safely evicted. In
111 * this case the arc user should drop a reference on some arc buffers so
112 * they can be reclaimed and the arc_meta_limit honored. For example,
113 * when using the ZPL each dentry holds a reference on a znode. These
114 * dentries must be pruned before the arc buffer holding the znode can
115 * be safely evicted.
116 *
117 * Note that the majority of the performance stats are manipulated
118 * with atomic operations.
119 *
120 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
121 *
122 * - L2ARC buflist creation
123 * - L2ARC buflist eviction
124 * - L2ARC write completion, which walks L2ARC buflists
125 * - ARC header destruction, as it removes from L2ARC buflists
126 * - ARC header release, as it removes from L2ARC buflists
127 */
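/*
 * Illustrative sketch (an assumption drawn from the comment above, not
 * original code): a thread that already holds an arc list lock may only
 * try-lock a hash table lock, never block on it:
 *
 *	mutex_enter(&state->arcs_mtx);		arc list lock first
 *	if (mutex_tryenter(hash_lock)) {	never mutex_enter() here
 *		...				safe to touch the header
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(&state->arcs_mtx);
 *
 * arc_evict() later in this file follows this pattern when scanning a
 * state list.
 */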
128
129#include <sys/spa.h>
130#include <sys/zio.h>
131#include <sys/zfs_context.h>
132#include <sys/arc.h>
133#include <sys/vdev.h>
134#include <sys/vdev_impl.h>
135#ifdef _KERNEL
136#include <sys/vmsystm.h>
137#include <vm/anon.h>
138#include <sys/fs/swapnode.h>
139#include <sys/zpl.h>
140#endif
141#include <sys/callb.h>
142#include <sys/kstat.h>
143#include <sys/dmu_tx.h>
144#include <zfs_fletcher.h>
145
146static kmutex_t arc_reclaim_thr_lock;
147static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
148static uint8_t arc_thread_exit;
149
150extern int zfs_write_limit_shift;
151extern uint64_t zfs_write_limit_max;
152extern kmutex_t zfs_write_limit_lock;
153
154/* number of bytes to prune from caches when the arc_meta_limit is reached */
155uint_t arc_meta_prune = 1048576;
156
157typedef enum arc_reclaim_strategy {
158 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
159 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
160} arc_reclaim_strategy_t;
161
162/* number of seconds before growing cache again */
163static int arc_grow_retry = 5;
164
165/* expiration time for arc_no_grow */
166static clock_t arc_grow_time = 0;
167
168/* shift of arc_c for calculating both min and max arc_p */
169static int arc_p_min_shift = 4;
170
171/* log2(fraction of arc to reclaim) */
172static int arc_shrink_shift = 5;
173
174/*
175 * minimum lifespan of a prefetch block in clock ticks
176 * (initialized in arc_init())
177 */
178static int arc_min_prefetch_lifespan;
179
180static int arc_dead;
181
182/*
183 * The arc has filled available memory and has now warmed up.
184 */
185static boolean_t arc_warm;
186
187/*
188 * These tunables are for performance analysis.
189 */
190unsigned long zfs_arc_max = 0;
191unsigned long zfs_arc_min = 0;
192unsigned long zfs_arc_meta_limit = 0;
193int zfs_arc_grow_retry = 0;
194int zfs_arc_shrink_shift = 0;
195int zfs_arc_p_min_shift = 0;
196int zfs_arc_meta_prune = 0;
197
198/*
199 * Note that buffers can be in one of 6 states:
200 * ARC_anon - anonymous (discussed below)
201 * ARC_mru - recently used, currently cached
202 * ARC_mru_ghost - recently used, no longer in cache
203 * ARC_mfu - frequently used, currently cached
204 * ARC_mfu_ghost - frequently used, no longer in cache
205 * ARC_l2c_only - exists in L2ARC but not other states
206 * When there are no active references to a buffer, it is
207 * linked onto a list in one of these arc states. These are
208 * the only buffers that can be evicted or deleted. Within each
209 * state there are multiple lists, one for meta-data and one for
210 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
211 * etc.) is tracked separately so that it can be managed more
212 * explicitly: favored over data, limited explicitly.
213 *
214 * Anonymous buffers are buffers that are not associated with
215 * a DVA. These are buffers that hold dirty block copies
216 * before they are written to stable storage. By definition,
217 * they are "ref'd" and are considered part of arc_mru
218 * that cannot be freed. Generally, they will acquire a DVA
219 * as they are written and migrate onto the arc_mru list.
220 *
221 * The ARC_l2c_only state is for buffers that are in the second
222 * level ARC but no longer in any of the ARC_m* lists. The second
223 * level ARC itself may also contain buffers that are in any of
224 * the ARC_m* states - meaning that a buffer can exist in two
225 * places. The reason for the ARC_l2c_only state is to keep the
226 * buffer header in the hash table, so that reads that hit the
227 * second level ARC benefit from these fast lookups.
228 */
229
230typedef struct arc_state {
231 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
232 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
233 uint64_t arcs_size; /* total amount of data in this state */
234 kmutex_t arcs_mtx;
235} arc_state_t;
236
237/* The 6 states: */
238static arc_state_t ARC_anon;
239static arc_state_t ARC_mru;
240static arc_state_t ARC_mru_ghost;
241static arc_state_t ARC_mfu;
242static arc_state_t ARC_mfu_ghost;
243static arc_state_t ARC_l2c_only;
244
245typedef struct arc_stats {
246 kstat_named_t arcstat_hits;
247 kstat_named_t arcstat_misses;
248 kstat_named_t arcstat_demand_data_hits;
249 kstat_named_t arcstat_demand_data_misses;
250 kstat_named_t arcstat_demand_metadata_hits;
251 kstat_named_t arcstat_demand_metadata_misses;
252 kstat_named_t arcstat_prefetch_data_hits;
253 kstat_named_t arcstat_prefetch_data_misses;
254 kstat_named_t arcstat_prefetch_metadata_hits;
255 kstat_named_t arcstat_prefetch_metadata_misses;
256 kstat_named_t arcstat_mru_hits;
257 kstat_named_t arcstat_mru_ghost_hits;
258 kstat_named_t arcstat_mfu_hits;
259 kstat_named_t arcstat_mfu_ghost_hits;
260 kstat_named_t arcstat_deleted;
261 kstat_named_t arcstat_recycle_miss;
262 kstat_named_t arcstat_mutex_miss;
263 kstat_named_t arcstat_evict_skip;
264 kstat_named_t arcstat_evict_l2_cached;
265 kstat_named_t arcstat_evict_l2_eligible;
266 kstat_named_t arcstat_evict_l2_ineligible;
267 kstat_named_t arcstat_hash_elements;
268 kstat_named_t arcstat_hash_elements_max;
269 kstat_named_t arcstat_hash_collisions;
270 kstat_named_t arcstat_hash_chains;
271 kstat_named_t arcstat_hash_chain_max;
272 kstat_named_t arcstat_p;
273 kstat_named_t arcstat_c;
274 kstat_named_t arcstat_c_min;
275 kstat_named_t arcstat_c_max;
276 kstat_named_t arcstat_size;
277 kstat_named_t arcstat_hdr_size;
278 kstat_named_t arcstat_data_size;
279 kstat_named_t arcstat_other_size;
280 kstat_named_t arcstat_anon_size;
281 kstat_named_t arcstat_anon_evict_data;
282 kstat_named_t arcstat_anon_evict_metadata;
283 kstat_named_t arcstat_mru_size;
284 kstat_named_t arcstat_mru_evict_data;
285 kstat_named_t arcstat_mru_evict_metadata;
286 kstat_named_t arcstat_mru_ghost_size;
287 kstat_named_t arcstat_mru_ghost_evict_data;
288 kstat_named_t arcstat_mru_ghost_evict_metadata;
289 kstat_named_t arcstat_mfu_size;
290 kstat_named_t arcstat_mfu_evict_data;
291 kstat_named_t arcstat_mfu_evict_metadata;
292 kstat_named_t arcstat_mfu_ghost_size;
293 kstat_named_t arcstat_mfu_ghost_evict_data;
294 kstat_named_t arcstat_mfu_ghost_evict_metadata;
295 kstat_named_t arcstat_l2_hits;
296 kstat_named_t arcstat_l2_misses;
297 kstat_named_t arcstat_l2_feeds;
298 kstat_named_t arcstat_l2_rw_clash;
299 kstat_named_t arcstat_l2_read_bytes;
300 kstat_named_t arcstat_l2_write_bytes;
301 kstat_named_t arcstat_l2_writes_sent;
302 kstat_named_t arcstat_l2_writes_done;
303 kstat_named_t arcstat_l2_writes_error;
304 kstat_named_t arcstat_l2_writes_hdr_miss;
305 kstat_named_t arcstat_l2_evict_lock_retry;
306 kstat_named_t arcstat_l2_evict_reading;
307 kstat_named_t arcstat_l2_free_on_write;
308 kstat_named_t arcstat_l2_abort_lowmem;
309 kstat_named_t arcstat_l2_cksum_bad;
310 kstat_named_t arcstat_l2_io_error;
311 kstat_named_t arcstat_l2_size;
312 kstat_named_t arcstat_l2_hdr_size;
313 kstat_named_t arcstat_memory_throttle_count;
314 kstat_named_t arcstat_memory_direct_count;
315 kstat_named_t arcstat_memory_indirect_count;
316 kstat_named_t arcstat_no_grow;
317 kstat_named_t arcstat_tempreserve;
318 kstat_named_t arcstat_loaned_bytes;
319 kstat_named_t arcstat_prune;
320 kstat_named_t arcstat_meta_used;
321 kstat_named_t arcstat_meta_limit;
322 kstat_named_t arcstat_meta_max;
323} arc_stats_t;
324
325static arc_stats_t arc_stats = {
326 { "hits", KSTAT_DATA_UINT64 },
327 { "misses", KSTAT_DATA_UINT64 },
328 { "demand_data_hits", KSTAT_DATA_UINT64 },
329 { "demand_data_misses", KSTAT_DATA_UINT64 },
330 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
331 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
332 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
333 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
334 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
335 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
336 { "mru_hits", KSTAT_DATA_UINT64 },
337 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
338 { "mfu_hits", KSTAT_DATA_UINT64 },
339 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
340 { "deleted", KSTAT_DATA_UINT64 },
341 { "recycle_miss", KSTAT_DATA_UINT64 },
342 { "mutex_miss", KSTAT_DATA_UINT64 },
343 { "evict_skip", KSTAT_DATA_UINT64 },
344 { "evict_l2_cached", KSTAT_DATA_UINT64 },
345 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
346 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
347 { "hash_elements", KSTAT_DATA_UINT64 },
348 { "hash_elements_max", KSTAT_DATA_UINT64 },
349 { "hash_collisions", KSTAT_DATA_UINT64 },
350 { "hash_chains", KSTAT_DATA_UINT64 },
351 { "hash_chain_max", KSTAT_DATA_UINT64 },
352 { "p", KSTAT_DATA_UINT64 },
353 { "c", KSTAT_DATA_UINT64 },
354 { "c_min", KSTAT_DATA_UINT64 },
355 { "c_max", KSTAT_DATA_UINT64 },
356 { "size", KSTAT_DATA_UINT64 },
357 { "hdr_size", KSTAT_DATA_UINT64 },
358 { "data_size", KSTAT_DATA_UINT64 },
359 { "other_size", KSTAT_DATA_UINT64 },
360 { "anon_size", KSTAT_DATA_UINT64 },
361 { "anon_evict_data", KSTAT_DATA_UINT64 },
362 { "anon_evict_metadata", KSTAT_DATA_UINT64 },
363 { "mru_size", KSTAT_DATA_UINT64 },
364 { "mru_evict_data", KSTAT_DATA_UINT64 },
365 { "mru_evict_metadata", KSTAT_DATA_UINT64 },
366 { "mru_ghost_size", KSTAT_DATA_UINT64 },
367 { "mru_ghost_evict_data", KSTAT_DATA_UINT64 },
368 { "mru_ghost_evict_metadata", KSTAT_DATA_UINT64 },
369 { "mfu_size", KSTAT_DATA_UINT64 },
370 { "mfu_evict_data", KSTAT_DATA_UINT64 },
371 { "mfu_evict_metadata", KSTAT_DATA_UINT64 },
372 { "mfu_ghost_size", KSTAT_DATA_UINT64 },
373 { "mfu_ghost_evict_data", KSTAT_DATA_UINT64 },
374 { "mfu_ghost_evict_metadata", KSTAT_DATA_UINT64 },
375 { "l2_hits", KSTAT_DATA_UINT64 },
376 { "l2_misses", KSTAT_DATA_UINT64 },
377 { "l2_feeds", KSTAT_DATA_UINT64 },
378 { "l2_rw_clash", KSTAT_DATA_UINT64 },
379 { "l2_read_bytes", KSTAT_DATA_UINT64 },
380 { "l2_write_bytes", KSTAT_DATA_UINT64 },
381 { "l2_writes_sent", KSTAT_DATA_UINT64 },
382 { "l2_writes_done", KSTAT_DATA_UINT64 },
383 { "l2_writes_error", KSTAT_DATA_UINT64 },
384 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
385 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
386 { "l2_evict_reading", KSTAT_DATA_UINT64 },
387 { "l2_free_on_write", KSTAT_DATA_UINT64 },
388 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
389 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
390 { "l2_io_error", KSTAT_DATA_UINT64 },
391 { "l2_size", KSTAT_DATA_UINT64 },
392 { "l2_hdr_size", KSTAT_DATA_UINT64 },
1834f2d8 393 { "memory_throttle_count", KSTAT_DATA_UINT64 },
7cb67b45
BB
394 { "memory_direct_count", KSTAT_DATA_UINT64 },
395 { "memory_indirect_count", KSTAT_DATA_UINT64 },
396 { "arc_no_grow", KSTAT_DATA_UINT64 },
397 { "arc_tempreserve", KSTAT_DATA_UINT64 },
398 { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
ab26409d 399 { "arc_prune", KSTAT_DATA_UINT64 },
1834f2d8
BB
400 { "arc_meta_used", KSTAT_DATA_UINT64 },
401 { "arc_meta_limit", KSTAT_DATA_UINT64 },
402 { "arc_meta_max", KSTAT_DATA_UINT64 },
403};
404
405#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
406
407#define ARCSTAT_INCR(stat, val) \
408 atomic_add_64(&arc_stats.stat.value.ui64, (val));
409
410#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
411#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
412
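/*
 * ARCSTAT_MAX() records a new maximum without taking a lock: it rereads
 * the current value and retries the atomic_cas_64() until the swap
 * succeeds or another thread has already published a larger value.
 */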
413#define ARCSTAT_MAX(stat, val) { \
414 uint64_t m; \
415 while ((val) > (m = arc_stats.stat.value.ui64) && \
416 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
417 continue; \
418}
419
420#define ARCSTAT_MAXSTAT(stat) \
421 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
422
423/*
424 * We define a macro to allow ARC hits/misses to be easily broken down by
425 * two separate conditions, giving a total of four different subtypes for
426 * each of hits and misses (so eight statistics total).
427 */
428#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
429 if (cond1) { \
430 if (cond2) { \
431 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
432 } else { \
433 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
434 } \
435 } else { \
436 if (cond2) { \
437 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
438 } else { \
439 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
440 } \
441 }
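/*
 * Example usage (this mirrors the call made in arc_buf_add_ref() later
 * in this file):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */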
442
443kstat_t *arc_ksp;
444static arc_state_t *arc_anon;
445static arc_state_t *arc_mru;
446static arc_state_t *arc_mru_ghost;
447static arc_state_t *arc_mfu;
448static arc_state_t *arc_mfu_ghost;
449static arc_state_t *arc_l2c_only;
450
451/*
452 * There are several ARC variables that are critical to export as kstats --
453 * but we don't want to have to grovel around in the kstat whenever we wish to
454 * manipulate them. For these variables, we therefore define them to be in
455 * terms of the statistic variable. This assures that we are not introducing
456 * the possibility of inconsistency by having shadow copies of the variables,
457 * while still allowing the code to be readable.
458 */
459#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
460#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
461#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
462#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
463#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
464#define arc_no_grow ARCSTAT(arcstat_no_grow)
465#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
466#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
467#define arc_meta_used ARCSTAT(arcstat_meta_used)
468#define arc_meta_limit ARCSTAT(arcstat_meta_limit)
469#define arc_meta_max ARCSTAT(arcstat_meta_max)
470
471typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
472
473typedef struct arc_callback arc_callback_t;
474
475struct arc_callback {
476 void *acb_private;
477 arc_done_func_t *acb_done;
478 arc_buf_t *acb_buf;
479 zio_t *acb_zio_dummy;
480 arc_callback_t *acb_next;
481};
482
483typedef struct arc_write_callback arc_write_callback_t;
484
485struct arc_write_callback {
486 void *awcb_private;
487 arc_done_func_t *awcb_ready;
488 arc_done_func_t *awcb_done;
489 arc_buf_t *awcb_buf;
490};
491
492struct arc_buf_hdr {
493 /* protected by hash lock */
494 dva_t b_dva;
495 uint64_t b_birth;
496 uint64_t b_cksum0;
497
498 kmutex_t b_freeze_lock;
499 zio_cksum_t *b_freeze_cksum;
500 void *b_thawed;
501
502 arc_buf_hdr_t *b_hash_next;
503 arc_buf_t *b_buf;
504 uint32_t b_flags;
505 uint32_t b_datacnt;
506
507 arc_callback_t *b_acb;
508 kcondvar_t b_cv;
509
510 /* immutable */
511 arc_buf_contents_t b_type;
512 uint64_t b_size;
513 uint64_t b_spa;
514
515 /* protected by arc state mutex */
516 arc_state_t *b_state;
517 list_node_t b_arc_node;
518
519 /* updated atomically */
520 clock_t b_arc_access;
521
522 /* self protecting */
523 refcount_t b_refcnt;
524
525 l2arc_buf_hdr_t *b_l2hdr;
526 list_node_t b_l2node;
527};
528
529static list_t arc_prune_list;
530static kmutex_t arc_prune_mtx;
531static arc_buf_t *arc_eviction_list;
532static kmutex_t arc_eviction_mtx;
533static arc_buf_hdr_t arc_eviction_hdr;
534static void arc_get_data_buf(arc_buf_t *buf);
535static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
536static int arc_evict_needed(arc_buf_contents_t type);
537static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
538
539static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
540
541#define GHOST_STATE(state) \
542 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
543 (state) == arc_l2c_only)
544
545/*
546 * Private ARC flags. These flags are private ARC only flags that will show up
547 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
548 * be passed in as arc_flags in things like arc_read. However, these flags
549 * should never be passed and should only be set by ARC code. When adding new
550 * public flags, make sure not to smash the private ones.
551 */
552
553#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
554#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
555#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
556#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
557#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
558#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
559#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
560#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
561#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
562#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
563
564#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
565#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
566#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
567#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
568#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
569#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
570#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
571#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
572#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
573 (hdr)->b_l2hdr != NULL)
574#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
575#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
576#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
577
578/*
579 * Other sizes
580 */
581
582#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
583#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
584
585/*
586 * Hash table routines
587 */
588
589#define HT_LOCK_ALIGN 64
590#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
591
592struct ht_lock {
593 kmutex_t ht_lock;
594#ifdef _KERNEL
595 unsigned char pad[HT_LOCK_PAD];
596#endif
597};
598
599#define BUF_LOCKS 256
600typedef struct buf_hash_table {
601 uint64_t ht_mask;
602 arc_buf_hdr_t **ht_table;
603 struct ht_lock ht_locks[BUF_LOCKS];
604} buf_hash_table_t;
605
606static buf_hash_table_t buf_hash_table;
607
608#define BUF_HASH_INDEX(spa, dva, birth) \
609 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
610#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
611#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
612#define HDR_LOCK(hdr) \
613 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
614
615uint64_t zfs_crc64_table[256];
616
617/*
618 * Level 2 ARC
619 */
620
621#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
622#define L2ARC_HEADROOM 2 /* num of writes */
623#define L2ARC_FEED_SECS 1 /* caching interval secs */
624#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
625
626#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
627#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
628
629/*
630 * L2ARC Performance Tunables
631 */
632unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
633unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
634unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
635unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
636unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
637int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
638int l2arc_feed_again = B_TRUE; /* turbo warmup */
639int l2arc_norw = B_TRUE; /* no reads during writes */
640
641/*
642 * L2ARC Internals
643 */
644typedef struct l2arc_dev {
645 vdev_t *l2ad_vdev; /* vdev */
646 spa_t *l2ad_spa; /* spa */
647 uint64_t l2ad_hand; /* next write location */
648 uint64_t l2ad_write; /* desired write size, bytes */
649 uint64_t l2ad_boost; /* warmup write boost, bytes */
650 uint64_t l2ad_start; /* first addr on device */
651 uint64_t l2ad_end; /* last addr on device */
652 uint64_t l2ad_evict; /* last addr eviction reached */
653 boolean_t l2ad_first; /* first sweep through */
654 boolean_t l2ad_writing; /* currently writing */
655 list_t *l2ad_buflist; /* buffer list */
656 list_node_t l2ad_node; /* device list node */
657} l2arc_dev_t;
658
659static list_t L2ARC_dev_list; /* device list */
660static list_t *l2arc_dev_list; /* device list pointer */
661static kmutex_t l2arc_dev_mtx; /* device list mutex */
662static l2arc_dev_t *l2arc_dev_last; /* last device used */
663static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
664static list_t L2ARC_free_on_write; /* free after write buf list */
665static list_t *l2arc_free_on_write; /* free after write list ptr */
666static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
667static uint64_t l2arc_ndev; /* number of devices */
668
669typedef struct l2arc_read_callback {
670 arc_buf_t *l2rcb_buf; /* read buffer */
671 spa_t *l2rcb_spa; /* spa */
672 blkptr_t l2rcb_bp; /* original blkptr */
673 zbookmark_t l2rcb_zb; /* original bookmark */
674 int l2rcb_flags; /* original flags */
675} l2arc_read_callback_t;
676
677typedef struct l2arc_write_callback {
678 l2arc_dev_t *l2wcb_dev; /* device info */
679 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
680} l2arc_write_callback_t;
681
682struct l2arc_buf_hdr {
683 /* protected by arc_buf_hdr mutex */
684 l2arc_dev_t *b_dev; /* L2ARC device */
685 uint64_t b_daddr; /* disk address, offset byte */
686};
687
688typedef struct l2arc_data_free {
689 /* protected by l2arc_free_on_write_mtx */
690 void *l2df_data;
691 size_t l2df_size;
692 void (*l2df_func)(void *, size_t);
693 list_node_t l2df_list_node;
694} l2arc_data_free_t;
695
696static kmutex_t l2arc_feed_thr_lock;
697static kcondvar_t l2arc_feed_thr_cv;
698static uint8_t l2arc_thread_exit;
699
700static void l2arc_read_done(zio_t *zio);
701static void l2arc_hdr_stat_add(void);
702static void l2arc_hdr_stat_remove(void);
703
704static uint64_t
705buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
706{
707 uint8_t *vdva = (uint8_t *)dva;
708 uint64_t crc = -1ULL;
709 int i;
710
711 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
712
713 for (i = 0; i < sizeof (dva_t); i++)
714 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
715
716 crc ^= (spa>>8) ^ birth;
717
718 return (crc);
719}
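/*
 * How the hash is consumed (see BUF_HASH_INDEX/BUF_HASH_LOCK above): the
 * full 64-bit value is masked down to a table slot, and the slot index in
 * turn selects one of the BUF_LOCKS hash-lock mutexes, e.g.:
 *
 *	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 *	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 */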
720
721#define BUF_EMPTY(buf) \
722 ((buf)->b_dva.dva_word[0] == 0 && \
723 (buf)->b_dva.dva_word[1] == 0 && \
724 (buf)->b_birth == 0)
725
726#define BUF_EQUAL(spa, dva, birth, buf) \
727 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
728 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
729 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
730
731static void
732buf_discard_identity(arc_buf_hdr_t *hdr)
733{
734 hdr->b_dva.dva_word[0] = 0;
735 hdr->b_dva.dva_word[1] = 0;
736 hdr->b_birth = 0;
737 hdr->b_cksum0 = 0;
738}
739
740static arc_buf_hdr_t *
741buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
742{
743 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
744 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
745 arc_buf_hdr_t *buf;
746
747 mutex_enter(hash_lock);
748 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
749 buf = buf->b_hash_next) {
750 if (BUF_EQUAL(spa, dva, birth, buf)) {
751 *lockp = hash_lock;
752 return (buf);
753 }
754 }
755 mutex_exit(hash_lock);
756 *lockp = NULL;
757 return (NULL);
758}
759
760/*
761 * Insert an entry into the hash table. If there is already an element
762 * equal to elem in the hash table, then the already existing element
763 * will be returned and the new element will not be inserted.
764 * Otherwise returns NULL.
765 */
766static arc_buf_hdr_t *
767buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
768{
769 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
770 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
771 arc_buf_hdr_t *fbuf;
772 uint32_t i;
773
774 ASSERT(!HDR_IN_HASH_TABLE(buf));
775 *lockp = hash_lock;
776 mutex_enter(hash_lock);
777 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
778 fbuf = fbuf->b_hash_next, i++) {
779 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
780 return (fbuf);
781 }
782
783 buf->b_hash_next = buf_hash_table.ht_table[idx];
784 buf_hash_table.ht_table[idx] = buf;
785 buf->b_flags |= ARC_IN_HASH_TABLE;
786
787 /* collect some hash table performance data */
788 if (i > 0) {
789 ARCSTAT_BUMP(arcstat_hash_collisions);
790 if (i == 1)
791 ARCSTAT_BUMP(arcstat_hash_chains);
792
793 ARCSTAT_MAX(arcstat_hash_chain_max, i);
794 }
795
796 ARCSTAT_BUMP(arcstat_hash_elements);
797 ARCSTAT_MAXSTAT(arcstat_hash_elements);
798
799 return (NULL);
800}
801
802static void
803buf_hash_remove(arc_buf_hdr_t *buf)
804{
805 arc_buf_hdr_t *fbuf, **bufp;
806 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
807
808 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
809 ASSERT(HDR_IN_HASH_TABLE(buf));
810
811 bufp = &buf_hash_table.ht_table[idx];
812 while ((fbuf = *bufp) != buf) {
813 ASSERT(fbuf != NULL);
814 bufp = &fbuf->b_hash_next;
815 }
816 *bufp = buf->b_hash_next;
817 buf->b_hash_next = NULL;
818 buf->b_flags &= ~ARC_IN_HASH_TABLE;
819
820 /* collect some hash table performance data */
821 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
822
823 if (buf_hash_table.ht_table[idx] &&
824 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
825 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
826}
827
828/*
829 * Global data structures and functions for the buf kmem cache.
830 */
831static kmem_cache_t *hdr_cache;
832static kmem_cache_t *buf_cache;
833
834static void
835buf_fini(void)
836{
837 int i;
838
839#if defined(_KERNEL) && defined(HAVE_SPL)
840 /* Large allocations which do not require contiguous pages
841 * should be using vmem_free() in the linux kernel */
842 vmem_free(buf_hash_table.ht_table,
843 (buf_hash_table.ht_mask + 1) * sizeof (void *));
844#else
845 kmem_free(buf_hash_table.ht_table,
846 (buf_hash_table.ht_mask + 1) * sizeof (void *));
847#endif
848 for (i = 0; i < BUF_LOCKS; i++)
849 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
850 kmem_cache_destroy(hdr_cache);
851 kmem_cache_destroy(buf_cache);
852}
853
854/*
855 * Constructor callback - called when the cache is empty
856 * and a new buf is requested.
857 */
858/* ARGSUSED */
859static int
860hdr_cons(void *vbuf, void *unused, int kmflag)
861{
862 arc_buf_hdr_t *buf = vbuf;
863
864 bzero(buf, sizeof (arc_buf_hdr_t));
865 refcount_create(&buf->b_refcnt);
866 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
867 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
868 list_link_init(&buf->b_arc_node);
869 list_link_init(&buf->b_l2node);
870 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
871
872 return (0);
873}
874
875/* ARGSUSED */
876static int
877buf_cons(void *vbuf, void *unused, int kmflag)
878{
879 arc_buf_t *buf = vbuf;
880
881 bzero(buf, sizeof (arc_buf_t));
882 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
883 rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
884 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
885
886 return (0);
887}
888
889/*
890 * Destructor callback - called when a cached buf is
891 * no longer required.
892 */
893/* ARGSUSED */
894static void
895hdr_dest(void *vbuf, void *unused)
896{
897 arc_buf_hdr_t *buf = vbuf;
898
899 ASSERT(BUF_EMPTY(buf));
900 refcount_destroy(&buf->b_refcnt);
901 cv_destroy(&buf->b_cv);
902 mutex_destroy(&buf->b_freeze_lock);
903 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
904}
905
906/* ARGSUSED */
907static void
908buf_dest(void *vbuf, void *unused)
909{
910 arc_buf_t *buf = vbuf;
911
912 mutex_destroy(&buf->b_evict_lock);
913 rw_destroy(&buf->b_data_lock);
914 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
915}
916
917static void
918buf_init(void)
919{
920 uint64_t *ct;
921 uint64_t hsize = 1ULL << 12;
922 int i, j;
923
924 /*
925 * The hash table is big enough to fill all of physical memory
926 * with an average 64K block size. The table will take up
927 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
928 */
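/*
 * Worked example (illustrative, assuming a 4 GB machine with 8-byte
 * pointers): the loop below stops at hsize = 4 GB / 64 KB = 65536
 * buckets, i.e. a 512 KB table -- the 128 KB per GB noted above.
 */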
929 while (hsize * 65536 < physmem * PAGESIZE)
930 hsize <<= 1;
931retry:
932 buf_hash_table.ht_mask = hsize - 1;
933#if defined(_KERNEL) && defined(HAVE_SPL)
934 /* Large allocations which do not require contiguous pages
935 * should be using vmem_alloc() in the linux kernel */
936 buf_hash_table.ht_table =
937 vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
938#else
939 buf_hash_table.ht_table =
940 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
941#endif
942 if (buf_hash_table.ht_table == NULL) {
943 ASSERT(hsize > (1ULL << 8));
944 hsize >>= 1;
945 goto retry;
946 }
947
948 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
949 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
950 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
951 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
952
953 for (i = 0; i < 256; i++)
954 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
955 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
956
957 for (i = 0; i < BUF_LOCKS; i++) {
958 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
959 NULL, MUTEX_DEFAULT, NULL);
960 }
961}
962
963#define ARC_MINTIME (hz>>4) /* 62 ms */
964
965static void
966arc_cksum_verify(arc_buf_t *buf)
967{
968 zio_cksum_t zc;
969
970 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
971 return;
972
973 mutex_enter(&buf->b_hdr->b_freeze_lock);
974 if (buf->b_hdr->b_freeze_cksum == NULL ||
975 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
976 mutex_exit(&buf->b_hdr->b_freeze_lock);
977 return;
978 }
979 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
980 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
981 panic("buffer modified while frozen!");
982 mutex_exit(&buf->b_hdr->b_freeze_lock);
983}
984
985static int
986arc_cksum_equal(arc_buf_t *buf)
987{
988 zio_cksum_t zc;
989 int equal;
990
991 mutex_enter(&buf->b_hdr->b_freeze_lock);
992 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
993 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
994 mutex_exit(&buf->b_hdr->b_freeze_lock);
995
996 return (equal);
997}
998
999static void
1000arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1001{
1002 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1003 return;
1004
1005 mutex_enter(&buf->b_hdr->b_freeze_lock);
1006 if (buf->b_hdr->b_freeze_cksum != NULL) {
1007 mutex_exit(&buf->b_hdr->b_freeze_lock);
1008 return;
1009 }
1010 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1011 KM_PUSHPAGE);
1012 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1013 buf->b_hdr->b_freeze_cksum);
1014 mutex_exit(&buf->b_hdr->b_freeze_lock);
1015}
1016
1017void
1018arc_buf_thaw(arc_buf_t *buf)
1019{
1020 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1021 if (buf->b_hdr->b_state != arc_anon)
1022 panic("modifying non-anon buffer!");
1023 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1024 panic("modifying buffer while i/o in progress!");
1025 arc_cksum_verify(buf);
1026 }
1027
1028 mutex_enter(&buf->b_hdr->b_freeze_lock);
1029 if (buf->b_hdr->b_freeze_cksum != NULL) {
1030 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1031 buf->b_hdr->b_freeze_cksum = NULL;
1032 }
1033
1034 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1035 if (buf->b_hdr->b_thawed)
1036 kmem_free(buf->b_hdr->b_thawed, 1);
1037 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1038 }
1039
1040 mutex_exit(&buf->b_hdr->b_freeze_lock);
1041}
1042
1043void
1044arc_buf_freeze(arc_buf_t *buf)
1045{
1046 kmutex_t *hash_lock;
1047
1048 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1049 return;
1050
1051 hash_lock = HDR_LOCK(buf->b_hdr);
1052 mutex_enter(hash_lock);
1053
1054 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1055 buf->b_hdr->b_state == arc_anon);
1056 arc_cksum_compute(buf, B_FALSE);
1057 mutex_exit(hash_lock);
1058}
1059
1060static void
1061add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1062{
1063 ASSERT(MUTEX_HELD(hash_lock));
1064
1065 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1066 (ab->b_state != arc_anon)) {
1067 uint64_t delta = ab->b_size * ab->b_datacnt;
1068 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1069 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1070
1071 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1072 mutex_enter(&ab->b_state->arcs_mtx);
1073 ASSERT(list_link_active(&ab->b_arc_node));
1074 list_remove(list, ab);
1075 if (GHOST_STATE(ab->b_state)) {
1076 ASSERT3U(ab->b_datacnt, ==, 0);
1077 ASSERT3P(ab->b_buf, ==, NULL);
1078 delta = ab->b_size;
1079 }
1080 ASSERT(delta > 0);
1081 ASSERT3U(*size, >=, delta);
1082 atomic_add_64(size, -delta);
1083 mutex_exit(&ab->b_state->arcs_mtx);
1084 /* remove the prefetch flag if we get a reference */
1085 if (ab->b_flags & ARC_PREFETCH)
1086 ab->b_flags &= ~ARC_PREFETCH;
1087 }
1088}
1089
1090static int
1091remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1092{
1093 int cnt;
1094 arc_state_t *state = ab->b_state;
1095
1096 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1097 ASSERT(!GHOST_STATE(state));
1098
1099 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1100 (state != arc_anon)) {
1101 uint64_t *size = &state->arcs_lsize[ab->b_type];
1102
1103 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1104 mutex_enter(&state->arcs_mtx);
1105 ASSERT(!list_link_active(&ab->b_arc_node));
1106 list_insert_head(&state->arcs_list[ab->b_type], ab);
1107 ASSERT(ab->b_datacnt > 0);
1108 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1109 mutex_exit(&state->arcs_mtx);
1110 }
1111 return (cnt);
1112}
1113
1114/*
1115 * Move the supplied buffer to the indicated state. The mutex
1116 * for the buffer must be held by the caller.
1117 */
1118static void
1119arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1120{
1121 arc_state_t *old_state = ab->b_state;
1122 int64_t refcnt = refcount_count(&ab->b_refcnt);
1123 uint64_t from_delta, to_delta;
1124
1125 ASSERT(MUTEX_HELD(hash_lock));
1126 ASSERT(new_state != old_state);
1127 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1128 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1129 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1130
1131 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1132
1133 /*
1134 * If this buffer is evictable, transfer it from the
1135 * old state list to the new state list.
1136 */
1137 if (refcnt == 0) {
1138 if (old_state != arc_anon) {
1139 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1140 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1141
1142 if (use_mutex)
1143 mutex_enter(&old_state->arcs_mtx);
1144
1145 ASSERT(list_link_active(&ab->b_arc_node));
1146 list_remove(&old_state->arcs_list[ab->b_type], ab);
1147
1148 /*
1149 * If prefetching out of the ghost cache,
1150 * we will have a non-zero datacnt.
1151 */
1152 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1153 /* ghost elements have a ghost size */
1154 ASSERT(ab->b_buf == NULL);
1155 from_delta = ab->b_size;
1156 }
1157 ASSERT3U(*size, >=, from_delta);
1158 atomic_add_64(size, -from_delta);
1159
1160 if (use_mutex)
1161 mutex_exit(&old_state->arcs_mtx);
1162 }
1163 if (new_state != arc_anon) {
1164 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1165 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1166
1167 if (use_mutex)
1168 mutex_enter(&new_state->arcs_mtx);
1169
1170 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1171
1172 /* ghost elements have a ghost size */
1173 if (GHOST_STATE(new_state)) {
1174 ASSERT(ab->b_datacnt == 0);
1175 ASSERT(ab->b_buf == NULL);
1176 to_delta = ab->b_size;
1177 }
1178 atomic_add_64(size, to_delta);
1179
1180 if (use_mutex)
1181 mutex_exit(&new_state->arcs_mtx);
1182 }
1183 }
1184
1185 ASSERT(!BUF_EMPTY(ab));
1186 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1187 buf_hash_remove(ab);
1188
1189 /* adjust state sizes */
1190 if (to_delta)
1191 atomic_add_64(&new_state->arcs_size, to_delta);
1192 if (from_delta) {
1193 ASSERT3U(old_state->arcs_size, >=, from_delta);
1194 atomic_add_64(&old_state->arcs_size, -from_delta);
1195 }
1196 ab->b_state = new_state;
1197
1198 /* adjust l2arc hdr stats */
1199 if (new_state == arc_l2c_only)
1200 l2arc_hdr_stat_add();
1201 else if (old_state == arc_l2c_only)
1202 l2arc_hdr_stat_remove();
1203}
1204
1205void
1206arc_space_consume(uint64_t space, arc_space_type_t type)
1207{
1208 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1209
1210 switch (type) {
1211 default:
1212 break;
1213 case ARC_SPACE_DATA:
1214 ARCSTAT_INCR(arcstat_data_size, space);
1215 break;
1216 case ARC_SPACE_OTHER:
1217 ARCSTAT_INCR(arcstat_other_size, space);
1218 break;
1219 case ARC_SPACE_HDRS:
1220 ARCSTAT_INCR(arcstat_hdr_size, space);
1221 break;
1222 case ARC_SPACE_L2HDRS:
1223 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1224 break;
1225 }
1226
1227 atomic_add_64(&arc_meta_used, space);
1228 atomic_add_64(&arc_size, space);
1229}
1230
1231void
1232arc_space_return(uint64_t space, arc_space_type_t type)
1233{
1234 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1235
1236 switch (type) {
1237 default:
1238 break;
1239 case ARC_SPACE_DATA:
1240 ARCSTAT_INCR(arcstat_data_size, -space);
1241 break;
1242 case ARC_SPACE_OTHER:
1243 ARCSTAT_INCR(arcstat_other_size, -space);
1244 break;
1245 case ARC_SPACE_HDRS:
1246 ARCSTAT_INCR(arcstat_hdr_size, -space);
1247 break;
1248 case ARC_SPACE_L2HDRS:
1249 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1250 break;
1251 }
1252
1253 ASSERT(arc_meta_used >= space);
1254 if (arc_meta_max < arc_meta_used)
1255 arc_meta_max = arc_meta_used;
1256 atomic_add_64(&arc_meta_used, -space);
1257 ASSERT(arc_size >= space);
1258 atomic_add_64(&arc_size, -space);
1259}
1260
1261void *
1262arc_data_buf_alloc(uint64_t size)
1263{
1264 if (arc_evict_needed(ARC_BUFC_DATA))
1265 cv_signal(&arc_reclaim_thr_cv);
1266 atomic_add_64(&arc_size, size);
1267 return (zio_data_buf_alloc(size));
1268}
1269
1270void
1271arc_data_buf_free(void *buf, uint64_t size)
1272{
1273 zio_data_buf_free(buf, size);
1274 ASSERT(arc_size >= size);
1275 atomic_add_64(&arc_size, -size);
1276}
1277
1278arc_buf_t *
1279arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1280{
1281 arc_buf_hdr_t *hdr;
1282 arc_buf_t *buf;
1283
1284 ASSERT3U(size, >, 0);
1285 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1286 ASSERT(BUF_EMPTY(hdr));
1287 hdr->b_size = size;
1288 hdr->b_type = type;
1289 hdr->b_spa = spa_load_guid(spa);
1290 hdr->b_state = arc_anon;
1291 hdr->b_arc_access = 0;
1292 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1293 buf->b_hdr = hdr;
1294 buf->b_data = NULL;
1295 buf->b_efunc = NULL;
1296 buf->b_private = NULL;
1297 buf->b_next = NULL;
1298 hdr->b_buf = buf;
1299 arc_get_data_buf(buf);
1300 hdr->b_datacnt = 1;
1301 hdr->b_flags = 0;
1302 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1303 (void) refcount_add(&hdr->b_refcnt, tag);
1304
1305 return (buf);
1306}
1307
1308static char *arc_onloan_tag = "onloan";
1309
1310/*
1311 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1312 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1313 * buffers must be returned to the arc before they can be used by the DMU or
1314 * freed.
1315 */
1316arc_buf_t *
1317arc_loan_buf(spa_t *spa, int size)
1318{
1319 arc_buf_t *buf;
1320
1321 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1322
1323 atomic_add_64(&arc_loaned_bytes, size);
1324 return (buf);
1325}
1326
1327/*
1328 * Return a loaned arc buffer to the arc.
1329 */
1330void
1331arc_return_buf(arc_buf_t *buf, void *tag)
1332{
1333 arc_buf_hdr_t *hdr = buf->b_hdr;
1334
1335 ASSERT(buf->b_data != NULL);
1336 (void) refcount_add(&hdr->b_refcnt, tag);
1337 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1338
1339 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1340}
1341
1342/* Detach an arc_buf from a dbuf (tag) */
1343void
1344arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1345{
1346 arc_buf_hdr_t *hdr;
1347
1348 ASSERT(buf->b_data != NULL);
1349 hdr = buf->b_hdr;
1350 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1351 (void) refcount_remove(&hdr->b_refcnt, tag);
1352 buf->b_efunc = NULL;
1353 buf->b_private = NULL;
1354
1355 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1356}
1357
1358static arc_buf_t *
1359arc_buf_clone(arc_buf_t *from)
1360{
1361 arc_buf_t *buf;
1362 arc_buf_hdr_t *hdr = from->b_hdr;
1363 uint64_t size = hdr->b_size;
1364
1365 ASSERT(hdr->b_state != arc_anon);
1366
1367 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1368 buf->b_hdr = hdr;
1369 buf->b_data = NULL;
1370 buf->b_efunc = NULL;
1371 buf->b_private = NULL;
1372 buf->b_next = hdr->b_buf;
1373 hdr->b_buf = buf;
1374 arc_get_data_buf(buf);
1375 bcopy(from->b_data, buf->b_data, size);
1376 hdr->b_datacnt += 1;
1377 return (buf);
1378}
1379
1380void
1381arc_buf_add_ref(arc_buf_t *buf, void* tag)
1382{
1383 arc_buf_hdr_t *hdr;
1384 kmutex_t *hash_lock;
1385
1386 /*
1387 * Check to see if this buffer is evicted. Callers
1388 * must verify b_data != NULL to know if the add_ref
1389 * was successful.
1390 */
1391 mutex_enter(&buf->b_evict_lock);
1392 if (buf->b_data == NULL) {
1393 mutex_exit(&buf->b_evict_lock);
1394 return;
1395 }
1396 hash_lock = HDR_LOCK(buf->b_hdr);
1397 mutex_enter(hash_lock);
1398 hdr = buf->b_hdr;
1399 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1400 mutex_exit(&buf->b_evict_lock);
1401
1402 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1403 add_reference(hdr, hash_lock, tag);
1404 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1405 arc_access(hdr, hash_lock);
1406 mutex_exit(hash_lock);
1407 ARCSTAT_BUMP(arcstat_hits);
1408 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1409 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1410 data, metadata, hits);
1411}
1412
1413/*
1414 * Free the arc data buffer. If it is an l2arc write in progress,
1415 * the buffer is placed on l2arc_free_on_write to be freed later.
1416 */
1417static void
1418arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1419 void *data, size_t size)
1420{
1421 if (HDR_L2_WRITING(hdr)) {
1422 l2arc_data_free_t *df;
1423 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1424 df->l2df_data = data;
1425 df->l2df_size = size;
1426 df->l2df_func = free_func;
1427 mutex_enter(&l2arc_free_on_write_mtx);
1428 list_insert_head(l2arc_free_on_write, df);
1429 mutex_exit(&l2arc_free_on_write_mtx);
1430 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1431 } else {
1432 free_func(data, size);
1433 }
1434}
1435
1436static void
1437arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1438{
1439 arc_buf_t **bufp;
1440
1441 /* free up data associated with the buf */
1442 if (buf->b_data) {
1443 arc_state_t *state = buf->b_hdr->b_state;
1444 uint64_t size = buf->b_hdr->b_size;
1445 arc_buf_contents_t type = buf->b_hdr->b_type;
1446
1447 arc_cksum_verify(buf);
1448
1449 if (!recycle) {
1450 if (type == ARC_BUFC_METADATA) {
1451 arc_buf_data_free(buf->b_hdr, zio_buf_free,
1452 buf->b_data, size);
1453 arc_space_return(size, ARC_SPACE_DATA);
1454 } else {
1455 ASSERT(type == ARC_BUFC_DATA);
1456 arc_buf_data_free(buf->b_hdr,
1457 zio_data_buf_free, buf->b_data, size);
1458 ARCSTAT_INCR(arcstat_data_size, -size);
1459 atomic_add_64(&arc_size, -size);
1460 }
1461 }
1462 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1463 uint64_t *cnt = &state->arcs_lsize[type];
1464
1465 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1466 ASSERT(state != arc_anon);
1467
1468 ASSERT3U(*cnt, >=, size);
1469 atomic_add_64(cnt, -size);
1470 }
1471 ASSERT3U(state->arcs_size, >=, size);
1472 atomic_add_64(&state->arcs_size, -size);
1473 buf->b_data = NULL;
1474 ASSERT(buf->b_hdr->b_datacnt > 0);
1475 buf->b_hdr->b_datacnt -= 1;
1476 }
1477
1478 /* only remove the buf if requested */
1479 if (!all)
1480 return;
1481
1482 /* remove the buf from the hdr list */
1483 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1484 continue;
1485 *bufp = buf->b_next;
1486 buf->b_next = NULL;
1487
1488 ASSERT(buf->b_efunc == NULL);
1489
1490 /* clean up the buf */
1491 buf->b_hdr = NULL;
1492 kmem_cache_free(buf_cache, buf);
1493}
1494
1495static void
1496arc_hdr_destroy(arc_buf_hdr_t *hdr)
1497{
1498 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1499
1500 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1501 ASSERT3P(hdr->b_state, ==, arc_anon);
1502 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1503
1504 if (l2hdr != NULL) {
1505 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1506 /*
1507 * To prevent arc_free() and l2arc_evict() from
1508 * attempting to free the same buffer at the same time,
1509 * a FREE_IN_PROGRESS flag is given to arc_free() to
1510 * give it priority. l2arc_evict() can't destroy this
1511 * header while we are waiting on l2arc_buflist_mtx.
1512 *
1513 * The hdr may be removed from l2ad_buflist before we
1514 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1515 */
1516 if (!buflist_held) {
1517 mutex_enter(&l2arc_buflist_mtx);
1518 l2hdr = hdr->b_l2hdr;
1519 }
1520
1521 if (l2hdr != NULL) {
1522 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1523 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1524 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1525 if (hdr->b_state == arc_l2c_only)
1526 l2arc_hdr_stat_remove();
1527 hdr->b_l2hdr = NULL;
1528 }
1529
1530 if (!buflist_held)
1531 mutex_exit(&l2arc_buflist_mtx);
1532 }
1533
1534 if (!BUF_EMPTY(hdr)) {
1535 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1536 buf_discard_identity(hdr);
1537 }
1538 while (hdr->b_buf) {
1539 arc_buf_t *buf = hdr->b_buf;
1540
1541 if (buf->b_efunc) {
1542 mutex_enter(&arc_eviction_mtx);
1543 mutex_enter(&buf->b_evict_lock);
1544 ASSERT(buf->b_hdr != NULL);
1545 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1546 hdr->b_buf = buf->b_next;
1547 buf->b_hdr = &arc_eviction_hdr;
1548 buf->b_next = arc_eviction_list;
1549 arc_eviction_list = buf;
1550 mutex_exit(&buf->b_evict_lock);
1551 mutex_exit(&arc_eviction_mtx);
1552 } else {
1553 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1554 }
1555 }
1556 if (hdr->b_freeze_cksum != NULL) {
1557 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1558 hdr->b_freeze_cksum = NULL;
1559 }
1560 if (hdr->b_thawed) {
1561 kmem_free(hdr->b_thawed, 1);
1562 hdr->b_thawed = NULL;
1563 }
1564
1565 ASSERT(!list_link_active(&hdr->b_arc_node));
1566 ASSERT3P(hdr->b_hash_next, ==, NULL);
1567 ASSERT3P(hdr->b_acb, ==, NULL);
1568 kmem_cache_free(hdr_cache, hdr);
1569}
1570
1571void
1572arc_buf_free(arc_buf_t *buf, void *tag)
1573{
1574 arc_buf_hdr_t *hdr = buf->b_hdr;
1575 int hashed = hdr->b_state != arc_anon;
1576
1577 ASSERT(buf->b_efunc == NULL);
1578 ASSERT(buf->b_data != NULL);
1579
1580 if (hashed) {
1581 kmutex_t *hash_lock = HDR_LOCK(hdr);
1582
1583 mutex_enter(hash_lock);
1584 hdr = buf->b_hdr;
1585 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1586
1587 (void) remove_reference(hdr, hash_lock, tag);
1588 if (hdr->b_datacnt > 1) {
1589 arc_buf_destroy(buf, FALSE, TRUE);
1590 } else {
1591 ASSERT(buf == hdr->b_buf);
1592 ASSERT(buf->b_efunc == NULL);
1593 hdr->b_flags |= ARC_BUF_AVAILABLE;
1594 }
1595 mutex_exit(hash_lock);
1596 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1597 int destroy_hdr;
1598 /*
1599 * We are in the middle of an async write. Don't destroy
1600 * this buffer unless the write completes before we finish
1601 * decrementing the reference count.
1602 */
1603 mutex_enter(&arc_eviction_mtx);
1604 (void) remove_reference(hdr, NULL, tag);
1605 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1606 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1607 mutex_exit(&arc_eviction_mtx);
1608 if (destroy_hdr)
1609 arc_hdr_destroy(hdr);
1610 } else {
1611 if (remove_reference(hdr, NULL, tag) > 0)
1612 arc_buf_destroy(buf, FALSE, TRUE);
1613 else
1614 arc_hdr_destroy(hdr);
1615 }
1616}
1617
1618int
1619arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1620{
1621 arc_buf_hdr_t *hdr = buf->b_hdr;
1622 kmutex_t *hash_lock = HDR_LOCK(hdr);
1623 int no_callback = (buf->b_efunc == NULL);
1624
1625 if (hdr->b_state == arc_anon) {
1626 ASSERT(hdr->b_datacnt == 1);
1627 arc_buf_free(buf, tag);
1628 return (no_callback);
1629 }
1630
1631 mutex_enter(hash_lock);
1632 hdr = buf->b_hdr;
1633 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1634 ASSERT(hdr->b_state != arc_anon);
1635 ASSERT(buf->b_data != NULL);
1636
1637 (void) remove_reference(hdr, hash_lock, tag);
1638 if (hdr->b_datacnt > 1) {
1639 if (no_callback)
1640 arc_buf_destroy(buf, FALSE, TRUE);
1641 } else if (no_callback) {
1642 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1643 ASSERT(buf->b_efunc == NULL);
1644 hdr->b_flags |= ARC_BUF_AVAILABLE;
1645 }
1646 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1647 refcount_is_zero(&hdr->b_refcnt));
1648 mutex_exit(hash_lock);
1649 return (no_callback);
1650}
1651
1652int
1653arc_buf_size(arc_buf_t *buf)
1654{
1655 return (buf->b_hdr->b_size);
1656}
1657
1658/*
1659 * Evict buffers from the list until we've removed the specified number of
1660 * bytes. Move the removed buffers to the appropriate evict state.
1661 * If the recycle flag is set, then attempt to "recycle" a buffer:
1662 * - look for a buffer to evict that is `bytes' long.
1663 * - return the data block from this buffer rather than freeing it.
1664 * This flag is used by callers that are trying to make space for a
1665 * new buffer in a full arc cache.
1666 *
1667 * This function makes a "best effort". It skips over any buffers
1668 * it can't get a hash_lock on, and so may not catch all candidates.
1669 * It may also return without evicting as much space as requested.
1670 */
1671static void *
d164b209 1672arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
34dc7c2f
BB
1673 arc_buf_contents_t type)
1674{
1675 arc_state_t *evicted_state;
1676 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1677 arc_buf_hdr_t *ab, *ab_prev = NULL;
1678 list_t *list = &state->arcs_list[type];
1679 kmutex_t *hash_lock;
1680 boolean_t have_lock;
1681 void *stolen = NULL;
1682
1683 ASSERT(state == arc_mru || state == arc_mfu);
1684
1685 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1686
1687 mutex_enter(&state->arcs_mtx);
1688 mutex_enter(&evicted_state->arcs_mtx);
1689
1690 for (ab = list_tail(list); ab; ab = ab_prev) {
1691 ab_prev = list_prev(list, ab);
1692 /* prefetch buffers have a minimum lifespan */
1693 if (HDR_IO_IN_PROGRESS(ab) ||
1694 (spa && ab->b_spa != spa) ||
1695 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
428870ff
BB
1696 ddi_get_lbolt() - ab->b_arc_access <
1697 arc_min_prefetch_lifespan)) {
34dc7c2f
BB
1698 skipped++;
1699 continue;
1700 }
1701 /* "lookahead" for better eviction candidate */
1702 if (recycle && ab->b_size != bytes &&
1703 ab_prev && ab_prev->b_size == bytes)
1704 continue;
1705 hash_lock = HDR_LOCK(ab);
1706 have_lock = MUTEX_HELD(hash_lock);
1707 if (have_lock || mutex_tryenter(hash_lock)) {
1708 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1709 ASSERT(ab->b_datacnt > 0);
1710 while (ab->b_buf) {
1711 arc_buf_t *buf = ab->b_buf;
428870ff 1712 if (!mutex_tryenter(&buf->b_evict_lock)) {
b128c09f
BB
1713 missed += 1;
1714 break;
1715 }
34dc7c2f
BB
1716 if (buf->b_data) {
1717 bytes_evicted += ab->b_size;
1718 if (recycle && ab->b_type == type &&
1719 ab->b_size == bytes &&
1720 !HDR_L2_WRITING(ab)) {
1721 stolen = buf->b_data;
1722 recycle = FALSE;
1723 }
1724 }
1725 if (buf->b_efunc) {
1726 mutex_enter(&arc_eviction_mtx);
1727 arc_buf_destroy(buf,
1728 buf->b_data == stolen, FALSE);
1729 ab->b_buf = buf->b_next;
1730 buf->b_hdr = &arc_eviction_hdr;
1731 buf->b_next = arc_eviction_list;
1732 arc_eviction_list = buf;
1733 mutex_exit(&arc_eviction_mtx);
428870ff 1734 mutex_exit(&buf->b_evict_lock);
34dc7c2f 1735 } else {
428870ff 1736 mutex_exit(&buf->b_evict_lock);
34dc7c2f
BB
1737 arc_buf_destroy(buf,
1738 buf->b_data == stolen, TRUE);
1739 }
1740 }
428870ff
BB
1741
1742 if (ab->b_l2hdr) {
1743 ARCSTAT_INCR(arcstat_evict_l2_cached,
1744 ab->b_size);
1745 } else {
1746 if (l2arc_write_eligible(ab->b_spa, ab)) {
1747 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1748 ab->b_size);
1749 } else {
1750 ARCSTAT_INCR(
1751 arcstat_evict_l2_ineligible,
1752 ab->b_size);
1753 }
1754 }
1755
b128c09f
BB
1756 if (ab->b_datacnt == 0) {
1757 arc_change_state(evicted_state, ab, hash_lock);
1758 ASSERT(HDR_IN_HASH_TABLE(ab));
1759 ab->b_flags |= ARC_IN_HASH_TABLE;
1760 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1761 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1762 }
34dc7c2f
BB
1763 if (!have_lock)
1764 mutex_exit(hash_lock);
1765 if (bytes >= 0 && bytes_evicted >= bytes)
1766 break;
1767 } else {
1768 missed += 1;
1769 }
1770 }
1771
1772 mutex_exit(&evicted_state->arcs_mtx);
1773 mutex_exit(&state->arcs_mtx);
1774
1775 if (bytes_evicted < bytes)
3f504482 1776		dprintf("only evicted %lld bytes from %p\n",
34dc7c2f
BB
1777 (longlong_t)bytes_evicted, state);
1778
1779 if (skipped)
1780 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1781
1782 if (missed)
1783 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1784
1785 /*
1786	 * We have just evicted some data into the ghost state; make
1787 * sure we also adjust the ghost state size if necessary.
1788 */
1789 if (arc_no_grow &&
1790 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1791 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1792 arc_mru_ghost->arcs_size - arc_c;
1793
1794 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1795 int64_t todelete =
1796 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
b8864a23 1797 arc_evict_ghost(arc_mru_ghost, 0, todelete);
34dc7c2f
BB
1798 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1799 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1800 arc_mru_ghost->arcs_size +
1801 arc_mfu_ghost->arcs_size - arc_c);
b8864a23 1802 arc_evict_ghost(arc_mfu_ghost, 0, todelete);
34dc7c2f
BB
1803 }
1804 }
1805
1806 return (stolen);
1807}
1808
1809/*
1810 * Remove buffers from the list until we've removed the specified number of
1811 * bytes. Destroy the buffers that are removed.
1812 */
1813static void
d164b209 1814arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
34dc7c2f
BB
1815{
1816 arc_buf_hdr_t *ab, *ab_prev;
2598c001 1817 arc_buf_hdr_t marker;
34dc7c2f
BB
1818 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1819 kmutex_t *hash_lock;
1820 uint64_t bytes_deleted = 0;
1821 uint64_t bufs_skipped = 0;
1822
1823 ASSERT(GHOST_STATE(state));
2598c001 1824	bzero(&marker, sizeof (marker));
34dc7c2f
BB
1825top:
1826 mutex_enter(&state->arcs_mtx);
1827 for (ab = list_tail(list); ab; ab = ab_prev) {
1828 ab_prev = list_prev(list, ab);
1829 if (spa && ab->b_spa != spa)
1830 continue;
572e2857
BB
1831
1832 /* ignore markers */
1833 if (ab->b_spa == 0)
1834 continue;
1835
34dc7c2f 1836 hash_lock = HDR_LOCK(ab);
428870ff
BB
1837 /* caller may be trying to modify this buffer, skip it */
1838 if (MUTEX_HELD(hash_lock))
1839 continue;
34dc7c2f
BB
1840 if (mutex_tryenter(hash_lock)) {
1841 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1842 ASSERT(ab->b_buf == NULL);
1843 ARCSTAT_BUMP(arcstat_deleted);
1844 bytes_deleted += ab->b_size;
1845
1846 if (ab->b_l2hdr != NULL) {
1847 /*
1848 * This buffer is cached on the 2nd Level ARC;
1849 * don't destroy the header.
1850 */
1851 arc_change_state(arc_l2c_only, ab, hash_lock);
1852 mutex_exit(hash_lock);
1853 } else {
1854 arc_change_state(arc_anon, ab, hash_lock);
1855 mutex_exit(hash_lock);
1856 arc_hdr_destroy(ab);
1857 }
1858
1859 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1860 if (bytes >= 0 && bytes_deleted >= bytes)
1861 break;
572e2857
BB
1862 } else if (bytes < 0) {
1863 /*
1864 * Insert a list marker and then wait for the
1865			 * hash lock to become available. Once it's
1866 * available, restart from where we left off.
1867 */
1868 list_insert_after(list, ab, &marker);
1869 mutex_exit(&state->arcs_mtx);
1870 mutex_enter(hash_lock);
1871 mutex_exit(hash_lock);
1872 mutex_enter(&state->arcs_mtx);
1873 ab_prev = list_prev(list, &marker);
1874 list_remove(list, &marker);
1875 } else
34dc7c2f 1876 bufs_skipped += 1;
34dc7c2f
BB
1877 }
1878 mutex_exit(&state->arcs_mtx);
1879
1880 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1881 (bytes < 0 || bytes_deleted < bytes)) {
1882 list = &state->arcs_list[ARC_BUFC_METADATA];
1883 goto top;
1884 }
1885
1886 if (bufs_skipped) {
1887 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1888 ASSERT(bytes >= 0);
1889 }
1890
1891 if (bytes_deleted < bytes)
3f504482 1892 dprintf("only deleted %lld bytes from %p\n",
34dc7c2f
BB
1893 (longlong_t)bytes_deleted, state);
1894}
1895
1896static void
1897arc_adjust(void)
1898{
d164b209
BB
1899 int64_t adjustment, delta;
1900
1901 /*
1902 * Adjust MRU size
1903 */
34dc7c2f 1904
572e2857
BB
1905 adjustment = MIN((int64_t)(arc_size - arc_c),
1906 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1907 arc_p));
34dc7c2f 1908
d164b209
BB
1909 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1910 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
b8864a23 1911 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
d164b209 1912 adjustment -= delta;
34dc7c2f
BB
1913 }
1914
d164b209
BB
1915 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1916 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
b8864a23 1917 (void) arc_evict(arc_mru, 0, delta, FALSE,
34dc7c2f 1918 ARC_BUFC_METADATA);
34dc7c2f
BB
1919 }
1920
d164b209
BB
1921 /*
1922 * Adjust MFU size
1923 */
34dc7c2f 1924
d164b209
BB
1925 adjustment = arc_size - arc_c;
1926
1927 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1928 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
b8864a23 1929 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
d164b209 1930 adjustment -= delta;
34dc7c2f
BB
1931 }
1932
d164b209
BB
1933 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1934 int64_t delta = MIN(adjustment,
1935 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
b8864a23 1936 (void) arc_evict(arc_mfu, 0, delta, FALSE,
d164b209
BB
1937 ARC_BUFC_METADATA);
1938 }
34dc7c2f 1939
d164b209
BB
1940 /*
1941 * Adjust ghost lists
1942 */
34dc7c2f 1943
d164b209
BB
1944 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
1945
1946 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
1947 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
b8864a23 1948 arc_evict_ghost(arc_mru_ghost, 0, delta);
d164b209 1949 }
34dc7c2f 1950
d164b209
BB
1951 adjustment =
1952 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
34dc7c2f 1953
d164b209
BB
1954 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
1955 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
b8864a23 1956 arc_evict_ghost(arc_mfu_ghost, 0, delta);
34dc7c2f
BB
1957 }
1958}
1959
ab26409d
BB
1960/*
1961 * Request that ARC users drop references so that N bytes can be released
1962 * from the cache. This provides a mechanism to ensure the arc can honor
1963 * the arc_meta_limit and reclaim buffers which are pinned in the cache
1964 * by higher layers. (i.e. the zpl)
1965 */
1966static void
1967arc_do_user_prune(int64_t adjustment)
1968{
1969 arc_prune_func_t *func;
1970 void *private;
1971 arc_prune_t *cp, *np;
1972
1973 mutex_enter(&arc_prune_mtx);
1974
1975 cp = list_head(&arc_prune_list);
1976 while (cp != NULL) {
1977 func = cp->p_pfunc;
1978 private = cp->p_private;
1979 np = list_next(&arc_prune_list, cp);
1980 refcount_add(&cp->p_refcnt, func);
1981 mutex_exit(&arc_prune_mtx);
1982
1983 if (func != NULL)
1984 func(adjustment, private);
1985
1986 mutex_enter(&arc_prune_mtx);
1987
1988 /* User removed prune callback concurrently with execution */
1989 if (refcount_remove(&cp->p_refcnt, func) == 0) {
1990 ASSERT(!list_link_active(&cp->p_node));
1991 refcount_destroy(&cp->p_refcnt);
1992 kmem_free(cp, sizeof (*cp));
1993 }
1994
1995 cp = np;
1996 }
1997
1998 ARCSTAT_BUMP(arcstat_prune);
1999 mutex_exit(&arc_prune_mtx);
2000}
2001
34dc7c2f
BB
2002static void
2003arc_do_user_evicts(void)
2004{
2005 mutex_enter(&arc_eviction_mtx);
2006 while (arc_eviction_list != NULL) {
2007 arc_buf_t *buf = arc_eviction_list;
2008 arc_eviction_list = buf->b_next;
428870ff 2009 mutex_enter(&buf->b_evict_lock);
34dc7c2f 2010 buf->b_hdr = NULL;
428870ff 2011 mutex_exit(&buf->b_evict_lock);
34dc7c2f
BB
2012 mutex_exit(&arc_eviction_mtx);
2013
2014 if (buf->b_efunc != NULL)
2015 VERIFY(buf->b_efunc(buf) == 0);
2016
2017 buf->b_efunc = NULL;
2018 buf->b_private = NULL;
2019 kmem_cache_free(buf_cache, buf);
2020 mutex_enter(&arc_eviction_mtx);
2021 }
2022 mutex_exit(&arc_eviction_mtx);
2023}
2024
ab26409d
BB
2025/*
2026 * Evict only metadata objects from the cache, leaving the data objects.
2027 * This is only used to enforce the tunable arc_meta_limit; if we are
2028 * unable to evict enough buffers, notify the user via the prune callback.
2029 */
2030void
2031arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
2032{
2033 int64_t delta;
2034
2035 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2036 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2037 arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
2038 adjustment -= delta;
2039 }
2040
2041 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2042 delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2043 arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
2044 adjustment -= delta;
2045 }
2046
2047 if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
2048 arc_do_user_prune(arc_meta_prune);
2049}
2050
34dc7c2f
BB
2051/*
2052 * Flush all *evictable* data from the cache for the given spa.
2053 * NOTE: this will not touch "active" (i.e. referenced) data.
2054 */
2055void
2056arc_flush(spa_t *spa)
2057{
d164b209
BB
2058 uint64_t guid = 0;
2059
2060 if (spa)
3541dc6d 2061 guid = spa_load_guid(spa);
d164b209 2062
34dc7c2f 2063 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
d164b209 2064 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
34dc7c2f
BB
2065 if (spa)
2066 break;
2067 }
2068 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
d164b209 2069 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
34dc7c2f
BB
2070 if (spa)
2071 break;
2072 }
2073 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
d164b209 2074 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
34dc7c2f
BB
2075 if (spa)
2076 break;
2077 }
2078 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
d164b209 2079 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
34dc7c2f
BB
2080 if (spa)
2081 break;
2082 }
2083
d164b209
BB
2084 arc_evict_ghost(arc_mru_ghost, guid, -1);
2085 arc_evict_ghost(arc_mfu_ghost, guid, -1);
34dc7c2f
BB
2086
2087 mutex_enter(&arc_reclaim_thr_lock);
2088 arc_do_user_evicts();
2089 mutex_exit(&arc_reclaim_thr_lock);
2090 ASSERT(spa || arc_eviction_list == NULL);
2091}
2092
34dc7c2f 2093void
302f753f 2094arc_shrink(uint64_t bytes)
34dc7c2f
BB
2095{
2096 if (arc_c > arc_c_min) {
2097 uint64_t to_free;
2098
302f753f
BB
2099 to_free = bytes ? bytes : arc_c >> arc_shrink_shift;
2100
34dc7c2f
BB
2101 if (arc_c > arc_c_min + to_free)
2102 atomic_add_64(&arc_c, -to_free);
2103 else
2104 arc_c = arc_c_min;
2105
2106 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2107 if (arc_c > arc_size)
2108 arc_c = MAX(arc_size, arc_c_min);
2109 if (arc_p > arc_c)
2110 arc_p = (arc_c >> 1);
2111 ASSERT(arc_c >= arc_c_min);
2112 ASSERT((int64_t)arc_p >= 0);
2113 }
2114
2115 if (arc_size > arc_c)
2116 arc_adjust();
2117}
2118
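/*
 * Editor's note (illustration, not part of the original source): a worked
 * example of the default shrink step in arc_shrink() above.  When called
 * with bytes == 0 it frees arc_c >> arc_shrink_shift; assuming the usual
 * arc_shrink_shift of 5, an ARC with arc_c at 4GB would be asked to give
 * back 4GB / 32 = 128MB, and arc_p is likewise reduced by 1/32nd of its
 * own value, all subject to the arc_c_min floor checked above.
 */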
34dc7c2f 2119static void
302f753f 2120arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
34dc7c2f
BB
2121{
2122 size_t i;
2123 kmem_cache_t *prev_cache = NULL;
2124 kmem_cache_t *prev_data_cache = NULL;
2125 extern kmem_cache_t *zio_buf_cache[];
2126 extern kmem_cache_t *zio_data_buf_cache[];
34dc7c2f
BB
2127
2128 /*
2129 * An aggressive reclamation will shrink the cache size as well as
2130 * reap free buffers from the arc kmem caches.
2131 */
2132 if (strat == ARC_RECLAIM_AGGR)
302f753f 2133 arc_shrink(bytes);
34dc7c2f
BB
2134
2135 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2136 if (zio_buf_cache[i] != prev_cache) {
2137 prev_cache = zio_buf_cache[i];
2138 kmem_cache_reap_now(zio_buf_cache[i]);
2139 }
2140 if (zio_data_buf_cache[i] != prev_data_cache) {
2141 prev_data_cache = zio_data_buf_cache[i];
2142 kmem_cache_reap_now(zio_data_buf_cache[i]);
2143 }
2144 }
ab26409d 2145
34dc7c2f
BB
2146 kmem_cache_reap_now(buf_cache);
2147 kmem_cache_reap_now(hdr_cache);
2148}
2149
302f753f
BB
2150/*
2151 * Unlike other ZFS implementations, this thread is only responsible for
2152 * adapting the target ARC size on Linux. The responsibility for memory
2153 * reclamation has been entirely delegated to the arc_shrinker_func()
2154 * which is registered with the VM. To reflect this change in behavior,
2155 * the arc_reclaim thread has been renamed to arc_adapt.
2156 */
34dc7c2f 2157static void
302f753f 2158arc_adapt_thread(void)
34dc7c2f 2159{
34dc7c2f 2160 callb_cpr_t cpr;
ab26409d 2161 int64_t prune;
34dc7c2f
BB
2162
2163 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2164
2165 mutex_enter(&arc_reclaim_thr_lock);
2166 while (arc_thread_exit == 0) {
302f753f
BB
2167#ifndef _KERNEL
2168 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2169
2170 if (spa_get_random(100) == 0) {
34dc7c2f
BB
2171
2172 if (arc_no_grow) {
2173 if (last_reclaim == ARC_RECLAIM_CONS) {
2174 last_reclaim = ARC_RECLAIM_AGGR;
2175 } else {
2176 last_reclaim = ARC_RECLAIM_CONS;
2177 }
2178 } else {
2179 arc_no_grow = TRUE;
2180 last_reclaim = ARC_RECLAIM_AGGR;
2181 membar_producer();
2182 }
2183
2184 /* reset the growth delay for every reclaim */
302f753f 2185 arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz);
34dc7c2f 2186
302f753f 2187 arc_kmem_reap_now(last_reclaim, 0);
b128c09f 2188 arc_warm = B_TRUE;
302f753f
BB
2189 }
2190#endif /* !_KERNEL */
34dc7c2f 2191
302f753f
BB
2192		/* No recent memory pressure; allow the ARC to grow. */
2193 if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
34dc7c2f 2194 arc_no_grow = FALSE;
34dc7c2f 2195
ab26409d
BB
2196 /*
2197		 * Keep metadata usage within limits; arc_shrink() is not
2198		 * used here, so as to avoid collapsing the arc_c value when only the
2199 * arc_meta_limit is being exceeded.
2200 */
2201 prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
2202 if (prune > 0)
2203 arc_adjust_meta(prune, B_TRUE);
6a8f9b6b 2204
572e2857 2205 arc_adjust();
34dc7c2f
BB
2206
2207 if (arc_eviction_list != NULL)
2208 arc_do_user_evicts();
2209
2210 /* block until needed, or one second, whichever is shorter */
2211 CALLB_CPR_SAFE_BEGIN(&cpr);
5b63b3eb 2212 (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
428870ff 2213 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
34dc7c2f
BB
2214 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2215 }
2216
2217 arc_thread_exit = 0;
2218 cv_broadcast(&arc_reclaim_thr_cv);
2219 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2220 thread_exit();
2221}
2222
7cb67b45
BB
2223#ifdef _KERNEL
2224/*
302f753f
BB
2225 * Determine the amount of memory eligible for eviction contained in the
2226 * ARC. All clean data reported by the ghost lists can always be safely
2227 * evicted. Due to arc_c_min, the same does not hold for all clean data
2228 * contained by the regular mru and mfu lists.
2229 *
2230 * In the case of the regular mru and mfu lists, we need to report as
2231 * much clean data as possible, such that evicting that same reported
2232 * data will not bring arc_size below arc_c_min. Thus, in certain
2233 * circumstances, the total amount of clean data in the mru and mfu
2234 * lists might not actually be evictable.
2235 *
2236 * The following two distinct cases are accounted for:
2237 *
2238 * 1. The sum of the amount of dirty data contained by both the mru and
2239 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
2240 * is greater than or equal to arc_c_min.
2241 * (i.e. amount of dirty data >= arc_c_min)
2242 *
2243 * This is the easy case; all clean data contained by the mru and mfu
2244 * lists is evictable. Evicting all clean data can only drop arc_size
2245 * to the amount of dirty data, which is greater than arc_c_min.
2246 *
2247 * 2. The sum of the amount of dirty data contained by both the mru and
2248 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
2249 * is less than arc_c_min.
2250 * (i.e. arc_c_min > amount of dirty data)
2251 *
2252 * 2.1. arc_size is greater than or equal to arc_c_min.
2253 * (i.e. arc_size >= arc_c_min > amount of dirty data)
2254 *
2255 * In this case, not all clean data from the regular mru and mfu
2256 * lists is actually evictable; we must leave enough clean data
2257 * to keep arc_size above arc_c_min. Thus, the maximum amount of
2258 * evictable data from the two lists combined, is exactly the
2259 * difference between arc_size and arc_c_min.
2260 *
2261 * 2.2. arc_size is less than arc_c_min
2262 * (i.e. arc_c_min > arc_size > amount of dirty data)
2263 *
2264 * In this case, none of the data contained in the mru and mfu
2265 * lists is evictable, even if it's clean. Since arc_size is
2266 * already below arc_c_min, evicting any more would only
2267 * increase this negative difference.
7cb67b45 2268 */
302f753f
BB
2269static uint64_t
2270arc_evictable_memory(void) {
2271 uint64_t arc_clean =
2272 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
2273 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
2274 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
2275 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
2276 uint64_t ghost_clean =
2277 arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
2278 arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
2279 arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
2280 arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
2281 uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
2282
2283 if (arc_dirty >= arc_c_min)
2284 return (ghost_clean + arc_clean);
2285
2286 return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
2287}
2288
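/*
 * Editor's note (illustration, not part of the original source): a worked
 * example of the two cases above.  Suppose arc_size is 8GB and the clean
 * portion of the mru and mfu lists totals 6GB, so arc_dirty is 2GB.  With
 * arc_c_min at 1GB, arc_dirty >= arc_c_min (case 1) and all 6GB of clean
 * data plus ghost_clean is reported.  With arc_c_min at 4GB instead,
 * arc_dirty falls below arc_c_min (case 2.1), so only ghost_clean +
 * (arc_size - arc_c_min) = ghost_clean + 4GB is reported; evicting that
 * much leaves 2GB of clean data resident and arc_size exactly at
 * arc_c_min.
 */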
7e7baeca
BB
2289static int
2290__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
7cb67b45 2291{
302f753f 2292 uint64_t pages;
7cb67b45 2293
302f753f
BB
2294 /* The arc is considered warm once reclaim has occurred */
2295 if (unlikely(arc_warm == B_FALSE))
2296 arc_warm = B_TRUE;
7cb67b45 2297
302f753f
BB
2298 /* Return the potential number of reclaimable pages */
2299 pages = btop(arc_evictable_memory());
2300 if (sc->nr_to_scan == 0)
2301 return (pages);
3fd70ee6
BB
2302
2303 /* Not allowed to perform filesystem reclaim */
7e7baeca 2304 if (!(sc->gfp_mask & __GFP_FS))
3fd70ee6
BB
2305 return (-1);
2306
7cb67b45
BB
2307 /* Reclaim in progress */
2308 if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
2309 return (-1);
2310
302f753f
BB
2311 /*
2312	 * Evict the requested number of pages by shrinking arc_c by the
2313	 * requested amount. If there is nothing left to evict, just
2314 * reap whatever we can from the various arc slabs.
2315 */
2316 if (pages > 0) {
2317 arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
2318 pages = btop(arc_evictable_memory());
2319 } else {
2320 arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
2321 pages = -1;
2322 }
2323
2324 /*
2325 * When direct reclaim is observed it usually indicates a rapid
2326 * increase in memory pressure. This occurs because the kswapd
2327 * threads were unable to asynchronously keep enough free memory
2328 * available. In this case set arc_no_grow to briefly pause arc
2329 * growth to avoid compounding the memory pressure.
2330 */
7cb67b45 2331 if (current_is_kswapd()) {
302f753f 2332 ARCSTAT_BUMP(arcstat_memory_indirect_count);
7cb67b45 2333 } else {
302f753f
BB
2334 arc_no_grow = B_TRUE;
2335 arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz);
2336 ARCSTAT_BUMP(arcstat_memory_direct_count);
7cb67b45
BB
2337 }
2338
7cb67b45
BB
2339 mutex_exit(&arc_reclaim_thr_lock);
2340
302f753f 2341 return (pages);
7cb67b45 2342}
7e7baeca 2343SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
7cb67b45
BB
2344
2345SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
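/*
 * Editor's note (not part of the original source, and based on the legacy
 * Linux shrinker interface this code appears to target): the kernel first
 * probes the callback with sc->nr_to_scan == 0 to ask how many pages could
 * be reclaimed, then calls it again with a non-zero sc->nr_to_scan to
 * request actual reclaim.  Returning -1 tells the VM the cache cannot be
 * shrunk right now (no __GFP_FS, or a reclaim already in progress), which
 * matches the early returns in __arc_shrinker_func() above.
 */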
2346#endif /* _KERNEL */
2347
34dc7c2f
BB
2348/*
2349 * Adapt arc info given the number of bytes we are trying to add and
2350 * the state that we are coming from. This function is only called
2351 * when we are adding new content to the cache.
2352 */
2353static void
2354arc_adapt(int bytes, arc_state_t *state)
2355{
2356 int mult;
d164b209 2357 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
34dc7c2f
BB
2358
2359 if (state == arc_l2c_only)
2360 return;
2361
2362 ASSERT(bytes > 0);
2363 /*
2364 * Adapt the target size of the MRU list:
2365 * - if we just hit in the MRU ghost list, then increase
2366 * the target size of the MRU list.
2367 * - if we just hit in the MFU ghost list, then increase
2368 * the target size of the MFU list by decreasing the
2369 * target size of the MRU list.
2370 */
2371 if (state == arc_mru_ghost) {
2372 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2373 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
572e2857 2374 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
34dc7c2f 2375
d164b209 2376 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
34dc7c2f 2377 } else if (state == arc_mfu_ghost) {
d164b209
BB
2378 uint64_t delta;
2379
34dc7c2f
BB
2380 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2381 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
572e2857 2382 mult = MIN(mult, 10);
34dc7c2f 2383
d164b209
BB
2384 delta = MIN(bytes * mult, arc_p);
2385 arc_p = MAX(arc_p_min, arc_p - delta);
34dc7c2f
BB
2386 }
2387 ASSERT((int64_t)arc_p >= 0);
2388
34dc7c2f
BB
2389 if (arc_no_grow)
2390 return;
2391
2392 if (arc_c >= arc_c_max)
2393 return;
2394
2395 /*
2396 * If we're within (2 * maxblocksize) bytes of the target
2397 * cache size, increment the target cache size
2398 */
2399 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2400 atomic_add_64(&arc_c, (int64_t)bytes);
2401 if (arc_c > arc_c_max)
2402 arc_c = arc_c_max;
2403 else if (state == arc_anon)
2404 atomic_add_64(&arc_p, (int64_t)bytes);
2405 if (arc_p > arc_c)
2406 arc_p = arc_c;
2407 }
2408 ASSERT((int64_t)arc_p >= 0);
2409}
2410
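/*
 * Editor's note (illustration, not part of the original source): a worked
 * example of the ghost-hit adaptation above.  On a hit in the MRU ghost
 * list for a 128K block, with arc_mfu_ghost four times the size of
 * arc_mru_ghost, mult is 4 (capped at 10), so arc_p grows by 512K, clamped
 * to arc_c - arc_p_min.  A hit in the MFU ghost list makes the symmetric
 * adjustment, shrinking arc_p by bytes * mult but never below arc_p_min.
 */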
2411/*
2412 * Check if the cache has reached its limits and eviction is required
2413 * prior to insert.
2414 */
2415static int
2416arc_evict_needed(arc_buf_contents_t type)
2417{
2418 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2419 return (1);
2420
2421#ifdef _KERNEL
2422 /*
2423 * If zio data pages are being allocated out of a separate heap segment,
2424 * then enforce that the size of available vmem for this area remains
2425 * above about 1/32nd free.
2426 */
2427 if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2428 vmem_size(zio_arena, VMEM_FREE) <
2429 (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2430 return (1);
2431#endif
2432
302f753f 2433 if (arc_no_grow)
34dc7c2f
BB
2434 return (1);
2435
2436 return (arc_size > arc_c);
2437}
2438
2439/*
2440 * The buffer, supplied as the first argument, needs a data block.
2441 * So, if we are at cache max, determine which cache should be victimized.
2442 * We have the following cases:
2443 *
2444 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2445 * In this situation, if we're out of space but the resident size of the MFU is
2446 * under the limit, victimize the MFU cache to satisfy this insertion request.
2447 *
2448 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2449 * Here, we've used up all of the available space for the MRU, so we need to
2450 * evict from our own cache instead. Evict from the set of resident MRU
2451 * entries.
2452 *
2453 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2454 * c minus p represents the MFU space in the cache, since p is the size of the
2455 * cache that is dedicated to the MRU. In this situation there's still space on
2456 * the MFU side, so the MRU side needs to be victimized.
2457 *
2458 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2459 * MFU's resident set is consuming more space than it has been allotted. In
2460 * this situation, we must victimize our own cache, the MFU, for this insertion.
2461 */
2462static void
2463arc_get_data_buf(arc_buf_t *buf)
2464{
2465 arc_state_t *state = buf->b_hdr->b_state;
2466 uint64_t size = buf->b_hdr->b_size;
2467 arc_buf_contents_t type = buf->b_hdr->b_type;
2468
2469 arc_adapt(size, state);
2470
2471 /*
2472	 * We have not yet reached the cache maximum size;
2473 * just allocate a new buffer.
2474 */
2475 if (!arc_evict_needed(type)) {
2476 if (type == ARC_BUFC_METADATA) {
2477 buf->b_data = zio_buf_alloc(size);
d164b209 2478 arc_space_consume(size, ARC_SPACE_DATA);
34dc7c2f
BB
2479 } else {
2480 ASSERT(type == ARC_BUFC_DATA);
2481 buf->b_data = zio_data_buf_alloc(size);
d164b209 2482 ARCSTAT_INCR(arcstat_data_size, size);
34dc7c2f
BB
2483 atomic_add_64(&arc_size, size);
2484 }
2485 goto out;
2486 }
2487
2488 /*
2489 * If we are prefetching from the mfu ghost list, this buffer
2490 * will end up on the mru list; so steal space from there.
2491 */
2492 if (state == arc_mfu_ghost)
2493 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2494 else if (state == arc_mru_ghost)
2495 state = arc_mru;
2496
2497 if (state == arc_mru || state == arc_anon) {
2498 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
d164b209 2499 state = (arc_mfu->arcs_lsize[type] >= size &&
34dc7c2f
BB
2500 arc_p > mru_used) ? arc_mfu : arc_mru;
2501 } else {
2502 /* MFU cases */
2503 uint64_t mfu_space = arc_c - arc_p;
d164b209 2504 state = (arc_mru->arcs_lsize[type] >= size &&
34dc7c2f
BB
2505 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2506 }
ab26409d 2507
b8864a23 2508 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
34dc7c2f
BB
2509 if (type == ARC_BUFC_METADATA) {
2510 buf->b_data = zio_buf_alloc(size);
d164b209 2511 arc_space_consume(size, ARC_SPACE_DATA);
ab26409d
BB
2512
2513 /*
2514			 * If we are unable to recycle an existing metadata buffer,
2515			 * signal the reclaim thread. It will notify users
2516			 * via the prune callback to drop references. The
2517			 * prune callback is run in the context of the reclaim
2518 * thread to avoid deadlocking on the hash_lock.
2519 */
2520 cv_signal(&arc_reclaim_thr_cv);
34dc7c2f
BB
2521 } else {
2522 ASSERT(type == ARC_BUFC_DATA);
2523 buf->b_data = zio_data_buf_alloc(size);
d164b209 2524 ARCSTAT_INCR(arcstat_data_size, size);
34dc7c2f
BB
2525 atomic_add_64(&arc_size, size);
2526 }
ab26409d 2527
34dc7c2f
BB
2528 ARCSTAT_BUMP(arcstat_recycle_miss);
2529 }
2530 ASSERT(buf->b_data != NULL);
2531out:
2532 /*
2533 * Update the state size. Note that ghost states have a
2534 * "ghost size" and so don't need to be updated.
2535 */
2536 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2537 arc_buf_hdr_t *hdr = buf->b_hdr;
2538
2539 atomic_add_64(&hdr->b_state->arcs_size, size);
2540 if (list_link_active(&hdr->b_arc_node)) {
2541 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2542 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2543 }
2544 /*
2545 * If we are growing the cache, and we are adding anonymous
2546 * data, and we have outgrown arc_p, update arc_p
2547 */
2548 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2549 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2550 arc_p = MIN(arc_c, arc_p + size);
2551 }
2552}
2553
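/*
 * Editor's note (illustration, not part of the original source): a worked
 * example of the victimization cases described above arc_get_data_buf().
 * With arc_c at 8GB and arc_p at 3GB, the MFU side is entitled to
 * c - p = 5GB.  If an MFU insert arrives while the MFU lists hold 4GB
 * (case 3) there is still MFU headroom, so arc_evict() is aimed at the
 * MRU list; if the MFU already holds 6GB (case 4) the MFU itself is
 * victimized.  The recycle flag is passed as TRUE so that a same-sized
 * evicted block can be handed back directly as buf->b_data.
 */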
2554/*
2555 * This routine is called whenever a buffer is accessed.
2556 * NOTE: the hash lock is dropped in this function.
2557 */
2558static void
2559arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2560{
428870ff
BB
2561 clock_t now;
2562
34dc7c2f
BB
2563 ASSERT(MUTEX_HELD(hash_lock));
2564
2565 if (buf->b_state == arc_anon) {
2566 /*
2567 * This buffer is not in the cache, and does not
2568 * appear in our "ghost" list. Add the new buffer
2569 * to the MRU state.
2570 */
2571
2572 ASSERT(buf->b_arc_access == 0);
428870ff 2573 buf->b_arc_access = ddi_get_lbolt();
34dc7c2f
BB
2574 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2575 arc_change_state(arc_mru, buf, hash_lock);
2576
2577 } else if (buf->b_state == arc_mru) {
428870ff
BB
2578 now = ddi_get_lbolt();
2579
34dc7c2f
BB
2580 /*
2581 * If this buffer is here because of a prefetch, then either:
2582 * - clear the flag if this is a "referencing" read
2583 * (any subsequent access will bump this into the MFU state).
2584 * or
2585 * - move the buffer to the head of the list if this is
2586 * another prefetch (to make it less likely to be evicted).
2587 */
2588 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2589 if (refcount_count(&buf->b_refcnt) == 0) {
2590 ASSERT(list_link_active(&buf->b_arc_node));
2591 } else {
2592 buf->b_flags &= ~ARC_PREFETCH;
2593 ARCSTAT_BUMP(arcstat_mru_hits);
2594 }
428870ff 2595 buf->b_arc_access = now;
34dc7c2f
BB
2596 return;
2597 }
2598
2599 /*
2600 * This buffer has been "accessed" only once so far,
2601 * but it is still in the cache. Move it to the MFU
2602 * state.
2603 */
428870ff 2604 if (now > buf->b_arc_access + ARC_MINTIME) {
34dc7c2f
BB
2605 /*
2606 * More than 125ms have passed since we
2607 * instantiated this buffer. Move it to the
2608 * most frequently used state.
2609 */
428870ff 2610 buf->b_arc_access = now;
34dc7c2f
BB
2611 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2612 arc_change_state(arc_mfu, buf, hash_lock);
2613 }
2614 ARCSTAT_BUMP(arcstat_mru_hits);
2615 } else if (buf->b_state == arc_mru_ghost) {
2616 arc_state_t *new_state;
2617 /*
2618 * This buffer has been "accessed" recently, but
2619 * was evicted from the cache. Move it to the
2620 * MFU state.
2621 */
2622
2623 if (buf->b_flags & ARC_PREFETCH) {
2624 new_state = arc_mru;
2625 if (refcount_count(&buf->b_refcnt) > 0)
2626 buf->b_flags &= ~ARC_PREFETCH;
2627 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2628 } else {
2629 new_state = arc_mfu;
2630 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2631 }
2632
428870ff 2633 buf->b_arc_access = ddi_get_lbolt();
34dc7c2f
BB
2634 arc_change_state(new_state, buf, hash_lock);
2635
2636 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2637 } else if (buf->b_state == arc_mfu) {
2638 /*
2639 * This buffer has been accessed more than once and is
2640 * still in the cache. Keep it in the MFU state.
2641 *
2642 * NOTE: an add_reference() that occurred when we did
2643 * the arc_read() will have kicked this off the list.
2644 * If it was a prefetch, we will explicitly move it to
2645 * the head of the list now.
2646 */
2647 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2648 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2649 ASSERT(list_link_active(&buf->b_arc_node));
2650 }
2651 ARCSTAT_BUMP(arcstat_mfu_hits);
428870ff 2652 buf->b_arc_access = ddi_get_lbolt();
34dc7c2f
BB
2653 } else if (buf->b_state == arc_mfu_ghost) {
2654 arc_state_t *new_state = arc_mfu;
2655 /*
2656 * This buffer has been accessed more than once but has
2657 * been evicted from the cache. Move it back to the
2658 * MFU state.
2659 */
2660
2661 if (buf->b_flags & ARC_PREFETCH) {
2662 /*
2663 * This is a prefetch access...
2664 * move this block back to the MRU state.
2665 */
2666 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2667 new_state = arc_mru;
2668 }
2669
428870ff 2670 buf->b_arc_access = ddi_get_lbolt();
34dc7c2f
BB
2671 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2672 arc_change_state(new_state, buf, hash_lock);
2673
2674 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2675 } else if (buf->b_state == arc_l2c_only) {
2676 /*
2677 * This buffer is on the 2nd Level ARC.
2678 */
2679
428870ff 2680 buf->b_arc_access = ddi_get_lbolt();
34dc7c2f
BB
2681 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2682 arc_change_state(arc_mfu, buf, hash_lock);
2683 } else {
2684 ASSERT(!"invalid arc state");
2685 }
2686}
2687
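/*
 * Editor's note (illustration, not part of the original source): a typical
 * life cycle through arc_access() above.  A block read for the first time
 * is anonymous and is placed on the MRU list.  A later demand access more
 * than ARC_MINTIME after the first promotes it to the MFU list, while an
 * access inside that window simply counts as an MRU hit and leaves it
 * where it is.  If the block is evicted and read again, the header is
 * found in the corresponding ghost list and is reinstated on the MFU list
 * (or MRU, for prefetches); those ghost hits are what feed the arc_p
 * adjustment performed by arc_adapt().
 */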
2688/* a generic arc_done_func_t which you can use */
2689/* ARGSUSED */
2690void
2691arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2692{
428870ff
BB
2693 if (zio == NULL || zio->io_error == 0)
2694 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
34dc7c2f
BB
2695 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2696}
2697
2698/* a generic arc_done_func_t */
2699void
2700arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2701{
2702 arc_buf_t **bufp = arg;
2703 if (zio && zio->io_error) {
2704 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2705 *bufp = NULL;
2706 } else {
2707 *bufp = buf;
428870ff 2708 ASSERT(buf->b_data);
34dc7c2f
BB
2709 }
2710}
2711
2712static void
2713arc_read_done(zio_t *zio)
2714{
2715 arc_buf_hdr_t *hdr, *found;
2716 arc_buf_t *buf;
2717 arc_buf_t *abuf; /* buffer we're assigning to callback */
2718 kmutex_t *hash_lock;
2719 arc_callback_t *callback_list, *acb;
2720 int freeable = FALSE;
2721
2722 buf = zio->io_private;
2723 hdr = buf->b_hdr;
2724
2725 /*
2726	 * The hdr was inserted into the hash table and removed from lists
2727 * prior to starting I/O. We should find this header, since
2728 * it's in the hash table, and it should be legit since it's
2729 * not possible to evict it during the I/O. The only possible
2730 * reason for it not to be found is if we were freed during the
2731 * read.
2732 */
d164b209 2733 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
34dc7c2f
BB
2734 &hash_lock);
2735
2736 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2737 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2738 (found == hdr && HDR_L2_READING(hdr)));
2739
b128c09f 2740 hdr->b_flags &= ~ARC_L2_EVICTED;
34dc7c2f 2741 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
b128c09f 2742 hdr->b_flags &= ~ARC_L2CACHE;
34dc7c2f
BB
2743
2744 /* byteswap if necessary */
2745 callback_list = hdr->b_acb;
2746 ASSERT(callback_list != NULL);
428870ff 2747 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
b128c09f
BB
2748 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2749 byteswap_uint64_array :
2750 dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2751 func(buf->b_data, hdr->b_size);
2752 }
34dc7c2f
BB
2753
2754 arc_cksum_compute(buf, B_FALSE);
2755
428870ff
BB
2756 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2757 /*
2758 * Only call arc_access on anonymous buffers. This is because
2759 * if we've issued an I/O for an evicted buffer, we've already
2760 * called arc_access (to prevent any simultaneous readers from
2761 * getting confused).
2762 */
2763 arc_access(hdr, hash_lock);
2764 }
2765
34dc7c2f
BB
2766 /* create copies of the data buffer for the callers */
2767 abuf = buf;
2768 for (acb = callback_list; acb; acb = acb->acb_next) {
2769 if (acb->acb_done) {
2770 if (abuf == NULL)
2771 abuf = arc_buf_clone(buf);
2772 acb->acb_buf = abuf;
2773 abuf = NULL;
2774 }
2775 }
2776 hdr->b_acb = NULL;
2777 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2778 ASSERT(!HDR_BUF_AVAILABLE(hdr));
428870ff
BB
2779 if (abuf == buf) {
2780 ASSERT(buf->b_efunc == NULL);
2781 ASSERT(hdr->b_datacnt == 1);
34dc7c2f 2782 hdr->b_flags |= ARC_BUF_AVAILABLE;
428870ff 2783 }
34dc7c2f
BB
2784
2785 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2786
2787 if (zio->io_error != 0) {
2788 hdr->b_flags |= ARC_IO_ERROR;
2789 if (hdr->b_state != arc_anon)
2790 arc_change_state(arc_anon, hdr, hash_lock);
2791 if (HDR_IN_HASH_TABLE(hdr))
2792 buf_hash_remove(hdr);
2793 freeable = refcount_is_zero(&hdr->b_refcnt);
34dc7c2f
BB
2794 }
2795
2796 /*
2797 * Broadcast before we drop the hash_lock to avoid the possibility
2798 * that the hdr (and hence the cv) might be freed before we get to
2799 * the cv_broadcast().
2800 */
2801 cv_broadcast(&hdr->b_cv);
2802
2803 if (hash_lock) {
34dc7c2f
BB
2804 mutex_exit(hash_lock);
2805 } else {
2806 /*
2807 * This block was freed while we waited for the read to
2808 * complete. It has been removed from the hash table and
2809 * moved to the anonymous state (so that it won't show up
2810 * in the cache).
2811 */
2812 ASSERT3P(hdr->b_state, ==, arc_anon);
2813 freeable = refcount_is_zero(&hdr->b_refcnt);
2814 }
2815
2816 /* execute each callback and free its structure */
2817 while ((acb = callback_list) != NULL) {
2818 if (acb->acb_done)
2819 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2820
2821 if (acb->acb_zio_dummy != NULL) {
2822 acb->acb_zio_dummy->io_error = zio->io_error;
2823 zio_nowait(acb->acb_zio_dummy);
2824 }
2825
2826 callback_list = acb->acb_next;
2827 kmem_free(acb, sizeof (arc_callback_t));
2828 }
2829
2830 if (freeable)
2831 arc_hdr_destroy(hdr);
2832}
2833
2834/*
2835 * "Read" the block at the specified DVA (in bp) via the
2836 * cache. If the block is found in the cache, invoke the provided
2837 * callback immediately and return. Note that the `zio' parameter
2838 * in the callback will be NULL in this case, since no IO was
2839 * required. If the block is not in the cache pass the read request
2840 * on to the spa with a substitute callback function, so that the
2841 * requested block will be added to the cache.
2842 *
2843 * If a read request arrives for a block that has a read in-progress,
2844 * either wait for the in-progress read to complete (and return the
2845 * results); or, if this is a read with a "done" func, add a record
2846 * to the read to invoke the "done" func when the read completes,
2847 * and return; or just return.
2848 *
2849 * arc_read_done() will invoke all the requested "done" functions
2850 * for readers of this block.
b128c09f
BB
2851 *
2852 * Normal callers should use arc_read and pass the arc buffer and offset
2853 * for the bp. But if you know you don't need locking, you can use
2854 * arc_read_nolock().
34dc7c2f
BB
2855 */
2856int
428870ff 2857arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
b128c09f
BB
2858 arc_done_func_t *done, void *private, int priority, int zio_flags,
2859 uint32_t *arc_flags, const zbookmark_t *zb)
2860{
2861 int err;
b128c09f 2862
428870ff
BB
2863 if (pbuf == NULL) {
2864 /*
2865 * XXX This happens from traverse callback funcs, for
2866 * the objset_phys_t block.
2867 */
2868 return (arc_read_nolock(pio, spa, bp, done, private, priority,
2869 zio_flags, arc_flags, zb));
2870 }
2871
b128c09f
BB
2872 ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2873 ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
428870ff 2874 rw_enter(&pbuf->b_data_lock, RW_READER);
b128c09f
BB
2875
2876 err = arc_read_nolock(pio, spa, bp, done, private, priority,
2877 zio_flags, arc_flags, zb);
428870ff 2878 rw_exit(&pbuf->b_data_lock);
9babb374 2879
b128c09f
BB
2880 return (err);
2881}
2882
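/*
 * Editor's sketch (not part of the original source): a minimal example of
 * how a caller might issue a synchronous cached read through the interface
 * above.  The spa, bp, pbuf, zb and dest variables are assumed to be in
 * scope, and ZIO_PRIORITY_SYNC_READ / ZIO_FLAG_CANFAIL are typical choices
 * rather than requirements:
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	if (error == 0 && abuf != NULL) {
 *		bcopy(abuf->b_data, dest, abuf->b_hdr->b_size);
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 */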
2883int
428870ff 2884arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
b128c09f
BB
2885 arc_done_func_t *done, void *private, int priority, int zio_flags,
2886 uint32_t *arc_flags, const zbookmark_t *zb)
34dc7c2f
BB
2887{
2888 arc_buf_hdr_t *hdr;
d4ed6673 2889 arc_buf_t *buf = NULL;
34dc7c2f
BB
2890 kmutex_t *hash_lock;
2891 zio_t *rzio;
3541dc6d 2892 uint64_t guid = spa_load_guid(spa);
34dc7c2f
BB
2893
2894top:
428870ff
BB
2895 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2896 &hash_lock);
34dc7c2f
BB
2897 if (hdr && hdr->b_datacnt > 0) {
2898
2899 *arc_flags |= ARC_CACHED;
2900
2901 if (HDR_IO_IN_PROGRESS(hdr)) {
2902
2903 if (*arc_flags & ARC_WAIT) {
2904 cv_wait(&hdr->b_cv, hash_lock);
2905 mutex_exit(hash_lock);
2906 goto top;
2907 }
2908 ASSERT(*arc_flags & ARC_NOWAIT);
2909
2910 if (done) {
2911 arc_callback_t *acb = NULL;
2912
2913 acb = kmem_zalloc(sizeof (arc_callback_t),
691f6ac4 2914 KM_PUSHPAGE);
34dc7c2f
BB
2915 acb->acb_done = done;
2916 acb->acb_private = private;
34dc7c2f
BB
2917 if (pio != NULL)
2918 acb->acb_zio_dummy = zio_null(pio,
d164b209 2919 spa, NULL, NULL, NULL, zio_flags);
34dc7c2f
BB
2920
2921 ASSERT(acb->acb_done != NULL);
2922 acb->acb_next = hdr->b_acb;
2923 hdr->b_acb = acb;
2924 add_reference(hdr, hash_lock, private);
2925 mutex_exit(hash_lock);
2926 return (0);
2927 }
2928 mutex_exit(hash_lock);
2929 return (0);
2930 }
2931
2932 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2933
2934 if (done) {
2935 add_reference(hdr, hash_lock, private);
2936 /*
2937 * If this block is already in use, create a new
2938 * copy of the data so that we will be guaranteed
2939 * that arc_release() will always succeed.
2940 */
2941 buf = hdr->b_buf;
2942 ASSERT(buf);
2943 ASSERT(buf->b_data);
2944 if (HDR_BUF_AVAILABLE(hdr)) {
2945 ASSERT(buf->b_efunc == NULL);
2946 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2947 } else {
2948 buf = arc_buf_clone(buf);
2949 }
428870ff 2950
34dc7c2f
BB
2951 } else if (*arc_flags & ARC_PREFETCH &&
2952 refcount_count(&hdr->b_refcnt) == 0) {
2953 hdr->b_flags |= ARC_PREFETCH;
2954 }
2955 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2956 arc_access(hdr, hash_lock);
b128c09f
BB
2957 if (*arc_flags & ARC_L2CACHE)
2958 hdr->b_flags |= ARC_L2CACHE;
34dc7c2f
BB
2959 mutex_exit(hash_lock);
2960 ARCSTAT_BUMP(arcstat_hits);
2961 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2962 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2963 data, metadata, hits);
2964
2965 if (done)
2966 done(NULL, buf, private);
2967 } else {
2968 uint64_t size = BP_GET_LSIZE(bp);
2969 arc_callback_t *acb;
b128c09f 2970 vdev_t *vd = NULL;
e06be586 2971 uint64_t addr = -1;
d164b209 2972 boolean_t devw = B_FALSE;
34dc7c2f
BB
2973
2974 if (hdr == NULL) {
2975 /* this block is not in the cache */
2976 arc_buf_hdr_t *exists;
2977 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2978 buf = arc_buf_alloc(spa, size, private, type);
2979 hdr = buf->b_hdr;
2980 hdr->b_dva = *BP_IDENTITY(bp);
428870ff 2981 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
34dc7c2f
BB
2982 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2983 exists = buf_hash_insert(hdr, &hash_lock);
2984 if (exists) {
2985 /* somebody beat us to the hash insert */
2986 mutex_exit(hash_lock);
428870ff 2987 buf_discard_identity(hdr);
34dc7c2f
BB
2988 (void) arc_buf_remove_ref(buf, private);
2989 goto top; /* restart the IO request */
2990 }
2991 /* if this is a prefetch, we don't have a reference */
2992 if (*arc_flags & ARC_PREFETCH) {
2993 (void) remove_reference(hdr, hash_lock,
2994 private);
2995 hdr->b_flags |= ARC_PREFETCH;
2996 }
b128c09f
BB
2997 if (*arc_flags & ARC_L2CACHE)
2998 hdr->b_flags |= ARC_L2CACHE;
34dc7c2f
BB
2999 if (BP_GET_LEVEL(bp) > 0)
3000 hdr->b_flags |= ARC_INDIRECT;
3001 } else {
3002 /* this block is in the ghost cache */
3003 ASSERT(GHOST_STATE(hdr->b_state));
3004 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3005 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
3006 ASSERT(hdr->b_buf == NULL);
3007
3008 /* if this is a prefetch, we don't have a reference */
3009 if (*arc_flags & ARC_PREFETCH)
3010 hdr->b_flags |= ARC_PREFETCH;
3011 else
3012 add_reference(hdr, hash_lock, private);
b128c09f
BB
3013 if (*arc_flags & ARC_L2CACHE)
3014 hdr->b_flags |= ARC_L2CACHE;
34dc7c2f
BB
3015 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3016 buf->b_hdr = hdr;
3017 buf->b_data = NULL;
3018 buf->b_efunc = NULL;
3019 buf->b_private = NULL;
3020 buf->b_next = NULL;
3021 hdr->b_buf = buf;
34dc7c2f
BB
3022 ASSERT(hdr->b_datacnt == 0);
3023 hdr->b_datacnt = 1;
428870ff
BB
3024 arc_get_data_buf(buf);
3025 arc_access(hdr, hash_lock);
34dc7c2f
BB
3026 }
3027
428870ff
BB
3028 ASSERT(!GHOST_STATE(hdr->b_state));
3029
691f6ac4 3030 acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
34dc7c2f
BB
3031 acb->acb_done = done;
3032 acb->acb_private = private;
34dc7c2f
BB
3033
3034 ASSERT(hdr->b_acb == NULL);
3035 hdr->b_acb = acb;
3036 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3037
b128c09f
BB
3038 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3039 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
d164b209 3040 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
b128c09f
BB
3041 addr = hdr->b_l2hdr->b_daddr;
3042 /*
3043 * Lock out device removal.
3044 */
3045 if (vdev_is_dead(vd) ||
3046 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3047 vd = NULL;
3048 }
3049
3050 mutex_exit(hash_lock);
3051
34dc7c2f 3052 ASSERT3U(hdr->b_size, ==, size);
428870ff
BB
3053 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3054 uint64_t, size, zbookmark_t *, zb);
34dc7c2f
BB
3055 ARCSTAT_BUMP(arcstat_misses);
3056 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3057 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3058 data, metadata, misses);
3059
d164b209 3060 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
34dc7c2f
BB
3061 /*
3062 * Read from the L2ARC if the following are true:
b128c09f
BB
3063 * 1. The L2ARC vdev was previously cached.
3064 * 2. This buffer still has L2ARC metadata.
3065 * 3. This buffer isn't currently writing to the L2ARC.
3066 * 4. The L2ARC entry wasn't evicted, which may
3067 * also have invalidated the vdev.
d164b209 3068			 * 5. This isn't a prefetch, or l2arc_noprefetch is not set.
34dc7c2f 3069 */
b128c09f 3070 if (hdr->b_l2hdr != NULL &&
d164b209
BB
3071 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3072 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
34dc7c2f
BB
3073 l2arc_read_callback_t *cb;
3074
3075 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3076 ARCSTAT_BUMP(arcstat_l2_hits);
3077
34dc7c2f 3078 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
691f6ac4 3079 KM_PUSHPAGE);
34dc7c2f
BB
3080 cb->l2rcb_buf = buf;
3081 cb->l2rcb_spa = spa;
3082 cb->l2rcb_bp = *bp;
3083 cb->l2rcb_zb = *zb;
b128c09f 3084 cb->l2rcb_flags = zio_flags;
34dc7c2f
BB
3085
3086 /*
b128c09f
BB
3087 * l2arc read. The SCL_L2ARC lock will be
3088 * released by l2arc_read_done().
34dc7c2f
BB
3089 */
3090 rzio = zio_read_phys(pio, vd, addr, size,
3091 buf->b_data, ZIO_CHECKSUM_OFF,
b128c09f
BB
3092 l2arc_read_done, cb, priority, zio_flags |
3093 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3094 ZIO_FLAG_DONT_PROPAGATE |
3095 ZIO_FLAG_DONT_RETRY, B_FALSE);
34dc7c2f
BB
3096 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3097 zio_t *, rzio);
d164b209 3098 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
34dc7c2f 3099
b128c09f
BB
3100 if (*arc_flags & ARC_NOWAIT) {
3101 zio_nowait(rzio);
3102 return (0);
3103 }
34dc7c2f 3104
b128c09f
BB
3105 ASSERT(*arc_flags & ARC_WAIT);
3106 if (zio_wait(rzio) == 0)
3107 return (0);
3108
3109 /* l2arc read error; goto zio_read() */
34dc7c2f
BB
3110 } else {
3111 DTRACE_PROBE1(l2arc__miss,
3112 arc_buf_hdr_t *, hdr);
3113 ARCSTAT_BUMP(arcstat_l2_misses);
3114 if (HDR_L2_WRITING(hdr))
3115 ARCSTAT_BUMP(arcstat_l2_rw_clash);
b128c09f 3116 spa_config_exit(spa, SCL_L2ARC, vd);
34dc7c2f 3117 }
d164b209
BB
3118 } else {
3119 if (vd != NULL)
3120 spa_config_exit(spa, SCL_L2ARC, vd);
3121 if (l2arc_ndev != 0) {
3122 DTRACE_PROBE1(l2arc__miss,
3123 arc_buf_hdr_t *, hdr);
3124 ARCSTAT_BUMP(arcstat_l2_misses);
3125 }
34dc7c2f 3126 }
34dc7c2f
BB
3127
3128 rzio = zio_read(pio, spa, bp, buf->b_data, size,
b128c09f 3129 arc_read_done, buf, priority, zio_flags, zb);
34dc7c2f
BB
3130
3131 if (*arc_flags & ARC_WAIT)
3132 return (zio_wait(rzio));
3133
3134 ASSERT(*arc_flags & ARC_NOWAIT);
3135 zio_nowait(rzio);
3136 }
3137 return (0);
3138}
3139
ab26409d
BB
3140arc_prune_t *
3141arc_add_prune_callback(arc_prune_func_t *func, void *private)
3142{
3143 arc_prune_t *p;
3144
3145	p = kmem_alloc(sizeof (*p), KM_SLEEP);
3146 p->p_pfunc = func;
3147 p->p_private = private;
3148 list_link_init(&p->p_node);
3149 refcount_create(&p->p_refcnt);
3150
3151 mutex_enter(&arc_prune_mtx);
3152 refcount_add(&p->p_refcnt, &arc_prune_list);
3153 list_insert_head(&arc_prune_list, p);
3154 mutex_exit(&arc_prune_mtx);
3155
3156 return (p);
3157}
3158
3159void
3160arc_remove_prune_callback(arc_prune_t *p)
3161{
3162 mutex_enter(&arc_prune_mtx);
3163 list_remove(&arc_prune_list, p);
3164 if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
3165 refcount_destroy(&p->p_refcnt);
3166 kmem_free(p, sizeof (*p));
3167 }
3168 mutex_exit(&arc_prune_mtx);
3169}
3170
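/*
 * Editor's sketch (not part of the original source): how a higher layer
 * such as the ZPL might use the registration interface above.  The
 * my_prune() callback, the drop_cached_holds() helper and the sb_private
 * pointer are hypothetical; the only contract is that the callback tries
 * to release enough references for the ARC to reclaim roughly `bytes'
 * worth of pinned metadata buffers.
 *
 *	static void
 *	my_prune(int64_t bytes, void *private)
 *	{
 *		drop_cached_holds(private, bytes);
 *	}
 *
 *	arc_prune_t *ap = arc_add_prune_callback(my_prune, sb_private);
 *
 *	and, when the consumer is torn down:
 *
 *	arc_remove_prune_callback(ap);
 */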
34dc7c2f
BB
3171void
3172arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3173{
3174 ASSERT(buf->b_hdr != NULL);
3175 ASSERT(buf->b_hdr->b_state != arc_anon);
3176 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
428870ff
BB
3177 ASSERT(buf->b_efunc == NULL);
3178 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3179
34dc7c2f
BB
3180 buf->b_efunc = func;
3181 buf->b_private = private;
3182}
3183
3184/*
3185 * This is used by the DMU to let the ARC know that a buffer is
3186 * being evicted, so the ARC should clean up. If this arc buf
3187 * is not yet in the evicted state, it will be put there.
3188 */
3189int
3190arc_buf_evict(arc_buf_t *buf)
3191{
3192 arc_buf_hdr_t *hdr;
3193 kmutex_t *hash_lock;
3194 arc_buf_t **bufp;
3195
428870ff 3196 mutex_enter(&buf->b_evict_lock);
34dc7c2f
BB
3197 hdr = buf->b_hdr;
3198 if (hdr == NULL) {
3199 /*
3200 * We are in arc_do_user_evicts().
3201 */
3202 ASSERT(buf->b_data == NULL);
428870ff 3203 mutex_exit(&buf->b_evict_lock);
34dc7c2f 3204 return (0);
b128c09f
BB
3205 } else if (buf->b_data == NULL) {
3206 arc_buf_t copy = *buf; /* structure assignment */
34dc7c2f 3207 /*
b128c09f
BB
3208 * We are on the eviction list; process this buffer now
3209 * but let arc_do_user_evicts() do the reaping.
34dc7c2f 3210 */
b128c09f 3211 buf->b_efunc = NULL;
428870ff 3212 mutex_exit(&buf->b_evict_lock);
b128c09f
BB
3213 VERIFY(copy.b_efunc(&copy) == 0);
3214 return (1);
34dc7c2f 3215 }
b128c09f
BB
3216 hash_lock = HDR_LOCK(hdr);
3217 mutex_enter(hash_lock);
428870ff
BB
3218 hdr = buf->b_hdr;
3219 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
34dc7c2f 3220
34dc7c2f
BB
3221 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3222 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3223
3224 /*
3225 * Pull this buffer off of the hdr
3226 */
3227 bufp = &hdr->b_buf;
3228 while (*bufp != buf)
3229 bufp = &(*bufp)->b_next;
3230 *bufp = buf->b_next;
3231
3232 ASSERT(buf->b_data != NULL);
3233 arc_buf_destroy(buf, FALSE, FALSE);
3234
3235 if (hdr->b_datacnt == 0) {
3236 arc_state_t *old_state = hdr->b_state;
3237 arc_state_t *evicted_state;
3238
428870ff 3239 ASSERT(hdr->b_buf == NULL);
34dc7c2f
BB
3240 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3241
3242 evicted_state =
3243 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3244
3245 mutex_enter(&old_state->arcs_mtx);
3246 mutex_enter(&evicted_state->arcs_mtx);
3247
3248 arc_change_state(evicted_state, hdr, hash_lock);
3249 ASSERT(HDR_IN_HASH_TABLE(hdr));
3250 hdr->b_flags |= ARC_IN_HASH_TABLE;
3251 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3252
3253 mutex_exit(&evicted_state->arcs_mtx);
3254 mutex_exit(&old_state->arcs_mtx);
3255 }
3256 mutex_exit(hash_lock);
428870ff 3257 mutex_exit(&buf->b_evict_lock);
34dc7c2f
BB
3258
3259 VERIFY(buf->b_efunc(buf) == 0);
3260 buf->b_efunc = NULL;
3261 buf->b_private = NULL;
3262 buf->b_hdr = NULL;
428870ff 3263 buf->b_next = NULL;
34dc7c2f
BB
3264 kmem_cache_free(buf_cache, buf);
3265 return (1);
3266}
3267
3268/*
3269 * Release this buffer from the cache. This must be done
3270 * after a read and prior to modifying the buffer contents.
3271 * If the buffer has more than one reference, we must make
b128c09f 3272 * a new hdr for the buffer.
34dc7c2f
BB
3273 */
3274void
3275arc_release(arc_buf_t *buf, void *tag)
3276{
b128c09f 3277 arc_buf_hdr_t *hdr;
428870ff 3278 kmutex_t *hash_lock = NULL;
b128c09f 3279 l2arc_buf_hdr_t *l2hdr;
d4ed6673 3280 uint64_t buf_size = 0;
34dc7c2f 3281
428870ff
BB
3282 /*
3283 * It would be nice to assert that if it's DMU metadata (level >
3284 * 0 || it's the dnode file), then it must be syncing context.
3285 * But we don't know that information at this level.
3286 */
3287
3288 mutex_enter(&buf->b_evict_lock);
b128c09f
BB
3289 hdr = buf->b_hdr;
3290
34dc7c2f
BB
3291 /* this buffer is not on any list */
3292 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3293
3294 if (hdr->b_state == arc_anon) {
3295 /* this buffer is already released */
34dc7c2f 3296 ASSERT(buf->b_efunc == NULL);
9babb374
BB
3297 } else {
3298 hash_lock = HDR_LOCK(hdr);
3299 mutex_enter(hash_lock);
428870ff
BB
3300 hdr = buf->b_hdr;
3301 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
34dc7c2f
BB
3302 }
3303
b128c09f
BB
3304 l2hdr = hdr->b_l2hdr;
3305 if (l2hdr) {
3306 mutex_enter(&l2arc_buflist_mtx);
3307 hdr->b_l2hdr = NULL;
3308 buf_size = hdr->b_size;
3309 }
3310
34dc7c2f
BB
3311 /*
3312 * Do we have more than one buf?
3313 */
b128c09f 3314 if (hdr->b_datacnt > 1) {
34dc7c2f
BB
3315 arc_buf_hdr_t *nhdr;
3316 arc_buf_t **bufp;
3317 uint64_t blksz = hdr->b_size;
d164b209 3318 uint64_t spa = hdr->b_spa;
34dc7c2f
BB
3319 arc_buf_contents_t type = hdr->b_type;
3320 uint32_t flags = hdr->b_flags;
3321
b128c09f 3322 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
34dc7c2f 3323 /*
428870ff
BB
3324 * Pull the data off of this hdr and attach it to
3325 * a new anonymous hdr.
34dc7c2f
BB
3326 */
3327 (void) remove_reference(hdr, hash_lock, tag);
3328 bufp = &hdr->b_buf;
3329 while (*bufp != buf)
3330 bufp = &(*bufp)->b_next;
428870ff 3331 *bufp = buf->b_next;
34dc7c2f
BB
3332 buf->b_next = NULL;
3333
3334 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3335 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3336 if (refcount_is_zero(&hdr->b_refcnt)) {
3337 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3338 ASSERT3U(*size, >=, hdr->b_size);
3339 atomic_add_64(size, -hdr->b_size);
3340 }
3341 hdr->b_datacnt -= 1;
34dc7c2f
BB
3342 arc_cksum_verify(buf);
3343
3344 mutex_exit(hash_lock);
3345
3346 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3347 nhdr->b_size = blksz;
3348 nhdr->b_spa = spa;
3349 nhdr->b_type = type;
3350 nhdr->b_buf = buf;
3351 nhdr->b_state = arc_anon;
3352 nhdr->b_arc_access = 0;
3353 nhdr->b_flags = flags & ARC_L2_WRITING;
3354 nhdr->b_l2hdr = NULL;
3355 nhdr->b_datacnt = 1;
3356 nhdr->b_freeze_cksum = NULL;
3357 (void) refcount_add(&nhdr->b_refcnt, tag);
3358 buf->b_hdr = nhdr;
428870ff 3359 mutex_exit(&buf->b_evict_lock);
34dc7c2f
BB
3360 atomic_add_64(&arc_anon->arcs_size, blksz);
3361 } else {
428870ff 3362 mutex_exit(&buf->b_evict_lock);
34dc7c2f
BB
3363 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3364 ASSERT(!list_link_active(&hdr->b_arc_node));
3365 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
428870ff
BB
3366 if (hdr->b_state != arc_anon)
3367 arc_change_state(arc_anon, hdr, hash_lock);
34dc7c2f 3368 hdr->b_arc_access = 0;
428870ff
BB
3369 if (hash_lock)
3370 mutex_exit(hash_lock);
34dc7c2f 3371
428870ff 3372 buf_discard_identity(hdr);
34dc7c2f
BB
3373 arc_buf_thaw(buf);
3374 }
3375 buf->b_efunc = NULL;
3376 buf->b_private = NULL;
3377
3378 if (l2hdr) {
3379 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3380 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3381 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
34dc7c2f 3382 mutex_exit(&l2arc_buflist_mtx);
b128c09f 3383 }
34dc7c2f
BB
3384}
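/*
 * Illustrative sketch (not part of arc.c): when arc_release() above finds
 * more than one buf on the hdr, it unlinks the released buf from the
 * hdr's singly-linked b_buf chain using a pointer-to-pointer walk. The
 * stand-alone user-space program below models only that unlink idiom;
 * node_t, list_unlink() and the sample list are invented for the example.
 */
#include <assert.h>
#include <stddef.h>

typedef struct node {
	struct node *next;
	int id;
} node_t;

static void
list_unlink(node_t **headp, node_t *victim)
{
	node_t **npp = headp;

	/* walk the chain of next pointers until we point at the victim */
	while (*npp != victim)
		npp = &(*npp)->next;
	*npp = victim->next;		/* splice it out */
	victim->next = NULL;
}

int
main(void)
{
	node_t c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	node_t *head = &a;

	list_unlink(&head, &b);
	assert(head == &a && a.next == &c && b.next == NULL);
	return (0);
}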
3385
428870ff
BB
3386/*
3387 * Release this buffer. If it does not match the provided BP, fill it
3388 * with that block's contents.
3389 */
3390/* ARGSUSED */
3391int
3392arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
3393 zbookmark_t *zb)
3394{
3395 arc_release(buf, tag);
3396 return (0);
3397}
3398
34dc7c2f
BB
3399int
3400arc_released(arc_buf_t *buf)
3401{
b128c09f
BB
3402 int released;
3403
428870ff 3404 mutex_enter(&buf->b_evict_lock);
b128c09f 3405 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
428870ff 3406 mutex_exit(&buf->b_evict_lock);
b128c09f 3407 return (released);
34dc7c2f
BB
3408}
3409
3410int
3411arc_has_callback(arc_buf_t *buf)
3412{
b128c09f
BB
3413 int callback;
3414
428870ff 3415 mutex_enter(&buf->b_evict_lock);
b128c09f 3416 callback = (buf->b_efunc != NULL);
428870ff 3417 mutex_exit(&buf->b_evict_lock);
b128c09f 3418 return (callback);
34dc7c2f
BB
3419}
3420
3421#ifdef ZFS_DEBUG
3422int
3423arc_referenced(arc_buf_t *buf)
3424{
b128c09f
BB
3425 int referenced;
3426
428870ff 3427 mutex_enter(&buf->b_evict_lock);
b128c09f 3428 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
428870ff 3429 mutex_exit(&buf->b_evict_lock);
b128c09f 3430 return (referenced);
34dc7c2f
BB
3431}
3432#endif
3433
3434static void
3435arc_write_ready(zio_t *zio)
3436{
3437 arc_write_callback_t *callback = zio->io_private;
3438 arc_buf_t *buf = callback->awcb_buf;
3439 arc_buf_hdr_t *hdr = buf->b_hdr;
3440
b128c09f
BB
3441 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3442 callback->awcb_ready(zio, buf, callback->awcb_private);
3443
34dc7c2f
BB
3444 /*
3445 * If the IO is already in progress, then this is a re-write
b128c09f
BB
3446 * attempt, so we need to thaw and re-compute the cksum.
3447 * It is the responsibility of the callback to handle the
3448 * accounting for any re-write attempt.
34dc7c2f
BB
3449 */
3450 if (HDR_IO_IN_PROGRESS(hdr)) {
34dc7c2f
BB
3451 mutex_enter(&hdr->b_freeze_lock);
3452 if (hdr->b_freeze_cksum != NULL) {
3453 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3454 hdr->b_freeze_cksum = NULL;
3455 }
3456 mutex_exit(&hdr->b_freeze_lock);
3457 }
3458 arc_cksum_compute(buf, B_FALSE);
3459 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3460}
3461
3462static void
3463arc_write_done(zio_t *zio)
3464{
3465 arc_write_callback_t *callback = zio->io_private;
3466 arc_buf_t *buf = callback->awcb_buf;
3467 arc_buf_hdr_t *hdr = buf->b_hdr;
3468
428870ff
BB
3469 ASSERT(hdr->b_acb == NULL);
3470
3471 if (zio->io_error == 0) {
3472 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3473 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3474 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3475 } else {
3476 ASSERT(BUF_EMPTY(hdr));
3477 }
34dc7c2f 3478
34dc7c2f
BB
3479 /*
3480 * If the block to be written was all-zero, we may have
3481 * compressed it away. In this case no write was performed
428870ff
BB
3482 * so there will be no dva/birth/checksum. The buffer must
3483 * therefore remain anonymous (and uncached).
34dc7c2f
BB
3484 */
3485 if (!BUF_EMPTY(hdr)) {
3486 arc_buf_hdr_t *exists;
3487 kmutex_t *hash_lock;
3488
428870ff
BB
3489 ASSERT(zio->io_error == 0);
3490
34dc7c2f
BB
3491 arc_cksum_verify(buf);
3492
3493 exists = buf_hash_insert(hdr, &hash_lock);
3494 if (exists) {
3495 /*
3496 * This can only happen if we overwrite for
3497 * sync-to-convergence, because we remove
3498 * buffers from the hash table when we arc_free().
3499 */
428870ff
BB
3500 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3501 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3502 panic("bad overwrite, hdr=%p exists=%p",
3503 (void *)hdr, (void *)exists);
3504 ASSERT(refcount_is_zero(&exists->b_refcnt));
3505 arc_change_state(arc_anon, exists, hash_lock);
3506 mutex_exit(hash_lock);
3507 arc_hdr_destroy(exists);
3508 exists = buf_hash_insert(hdr, &hash_lock);
3509 ASSERT3P(exists, ==, NULL);
3510 } else {
3511 /* Dedup */
3512 ASSERT(hdr->b_datacnt == 1);
3513 ASSERT(hdr->b_state == arc_anon);
3514 ASSERT(BP_GET_DEDUP(zio->io_bp));
3515 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3516 }
34dc7c2f
BB
3517 }
3518 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
b128c09f 3519 /* if it's not anon, we are doing a scrub */
428870ff 3520 if (!exists && hdr->b_state == arc_anon)
b128c09f 3521 arc_access(hdr, hash_lock);
34dc7c2f 3522 mutex_exit(hash_lock);
34dc7c2f
BB
3523 } else {
3524 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3525 }
3526
428870ff
BB
3527 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3528 callback->awcb_done(zio, buf, callback->awcb_private);
34dc7c2f
BB
3529
3530 kmem_free(callback, sizeof (arc_write_callback_t));
3531}
3532
3533zio_t *
428870ff
BB
3534arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3535 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3536 arc_done_func_t *ready, arc_done_func_t *done, void *private,
3537 int priority, int zio_flags, const zbookmark_t *zb)
34dc7c2f
BB
3538{
3539 arc_buf_hdr_t *hdr = buf->b_hdr;
3540 arc_write_callback_t *callback;
b128c09f 3541 zio_t *zio;
34dc7c2f 3542
b128c09f 3543 ASSERT(ready != NULL);
428870ff 3544 ASSERT(done != NULL);
34dc7c2f
BB
3545 ASSERT(!HDR_IO_ERROR(hdr));
3546 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
428870ff 3547 ASSERT(hdr->b_acb == NULL);
b128c09f
BB
3548 if (l2arc)
3549 hdr->b_flags |= ARC_L2CACHE;
34dc7c2f
BB
3550 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3551 callback->awcb_ready = ready;
3552 callback->awcb_done = done;
3553 callback->awcb_private = private;
3554 callback->awcb_buf = buf;
b128c09f 3555
428870ff 3556 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
b128c09f 3557 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
34dc7c2f
BB
3558
3559 return (zio);
3560}
3561
34dc7c2f 3562static int
9babb374 3563arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
34dc7c2f
BB
3564{
3565#ifdef _KERNEL
302f753f 3566 uint64_t available_memory;
34dc7c2f 3567
302f753f
BB
3568 /* Easily reclaimable memory (free + inactive + arc-evictable) */
3569 available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
34dc7c2f
BB
3570#if defined(__i386)
3571 available_memory =
3572 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3573#endif
34dc7c2f 3574
302f753f 3575 if (available_memory <= zfs_write_limit_max) {
34dc7c2f 3576 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
570827e1 3577 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
34dc7c2f
BB
3578 return (EAGAIN);
3579 }
34dc7c2f
BB
3580
3581 if (inflight_data > available_memory / 4) {
3582 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
570827e1 3583 DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
34dc7c2f
BB
3584 return (ERESTART);
3585 }
3586#endif
3587 return (0);
3588}
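/*
 * Illustrative sketch (not part of arc.c): a stand-alone model of the two
 * cut-offs applied by arc_memory_throttle() above. EAGAIN is returned when
 * easily reclaimable memory has fallen to or below zfs_write_limit_max,
 * and ERESTART when the in-flight dirty data exceeds a quarter of that
 * reclaimable memory. All byte counts below are invented for the example.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static int
throttle_decision(uint64_t available, uint64_t write_limit_max,
    uint64_t inflight)
{
	if (available <= write_limit_max)
		return (EAGAIN);	/* reclaim memory before writing */
	if (inflight > available / 4)
		return (ERESTART);	/* too much dirty data in flight */
	return (0);
}

int
main(void)
{
	/* 512 MB reclaimable, 256 MB limit, 100 MB in flight: no throttle */
	printf("%d\n", throttle_decision(512ULL << 20, 256ULL << 20,
	    100ULL << 20));
	/* 200 MB in flight exceeds 512 MB / 4 = 128 MB: ERESTART */
	printf("%d\n", throttle_decision(512ULL << 20, 256ULL << 20,
	    200ULL << 20));
	/* only 200 MB reclaimable, at or below the limit: EAGAIN */
	printf("%d\n", throttle_decision(200ULL << 20, 256ULL << 20, 0));
	return (0);
}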
3589
3590void
3591arc_tempreserve_clear(uint64_t reserve)
3592{
3593 atomic_add_64(&arc_tempreserve, -reserve);
3594 ASSERT((int64_t)arc_tempreserve >= 0);
3595}
3596
3597int
3598arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3599{
3600 int error;
9babb374 3601 uint64_t anon_size;
34dc7c2f
BB
3602
3603#ifdef ZFS_DEBUG
3604 /*
3605 * Once in a while, fail for no reason. Everything should cope.
3606 */
3607 if (spa_get_random(10000) == 0) {
3608 dprintf("forcing random failure\n");
3609 return (ERESTART);
3610 }
3611#endif
3612 if (reserve > arc_c/4 && !arc_no_grow)
3613 arc_c = MIN(arc_c_max, reserve * 4);
570827e1
BB
3614 if (reserve > arc_c) {
3615 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
34dc7c2f 3616 return (ENOMEM);
570827e1 3617 }
34dc7c2f 3618
9babb374
BB
3619 /*
 3620 	 * Don't count loaned bufs as in-flight dirty data to prevent long
3621 * network delays from blocking transactions that are ready to be
3622 * assigned to a txg.
3623 */
3624 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3625
34dc7c2f
BB
3626 /*
3627 * Writes will, almost always, require additional memory allocations
 3628 	 * in order to compress/encrypt/etc. the data. We therefore need to
3629 * make sure that there is sufficient available memory for this.
3630 */
c65aa5b2 3631 if ((error = arc_memory_throttle(reserve, anon_size, txg)))
34dc7c2f
BB
3632 return (error);
3633
3634 /*
3635 * Throttle writes when the amount of dirty data in the cache
3636 * gets too large. We try to keep the cache less than half full
3637 * of dirty blocks so that our sync times don't grow too large.
3638 * Note: if two requests come in concurrently, we might let them
3639 * both succeed, when one of them should fail. Not a huge deal.
3640 */
9babb374
BB
3641
3642 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3643 anon_size > arc_c / 4) {
34dc7c2f
BB
3644 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3645 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3646 arc_tempreserve>>10,
3647 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3648 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3649 reserve>>10, arc_c>>10);
570827e1 3650 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
34dc7c2f
BB
3651 return (ERESTART);
3652 }
3653 atomic_add_64(&arc_tempreserve, reserve);
3654 return (0);
3655}
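/*
 * Illustrative sketch (not part of arc.c): a worked example of the dirty
 * data throttle in arc_tempreserve_space() above. The request fails with
 * ERESTART once reserve + arc_tempreserve + anon_size exceeds arc_c / 2
 * while anon_size alone exceeds arc_c / 4. arc_c is assumed to be 1 GB
 * here, so the limits work out to 512 MB and 256 MB.
 */
#include <stdint.h>
#include <stdio.h>

static int
would_throttle(uint64_t reserve, uint64_t tempreserve, uint64_t anon_size,
    uint64_t arc_c)
{
	return (reserve + tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4);
}

int
main(void)
{
	uint64_t arc_c = 1024ULL << 20;		/* assumed 1 GB target */

	/* 300 MB reserve + 300 MB anon: over both limits, throttled */
	printf("%d\n", would_throttle(300ULL << 20, 0, 300ULL << 20, arc_c));
	/* 100 MB reserve + 100 MB anon: well under both limits */
	printf("%d\n", would_throttle(100ULL << 20, 0, 100ULL << 20, arc_c));
	return (0);
}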
3656
13be560d
BB
3657static void
3658arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
3659 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
3660{
3661 size->value.ui64 = state->arcs_size;
3662 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
3663 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
3664}
3665
3666static int
3667arc_kstat_update(kstat_t *ksp, int rw)
3668{
3669 arc_stats_t *as = ksp->ks_data;
3670
3671 if (rw == KSTAT_WRITE) {
3672 return (EACCES);
3673 } else {
3674 arc_kstat_update_state(arc_anon,
3675 &as->arcstat_anon_size,
3676 &as->arcstat_anon_evict_data,
3677 &as->arcstat_anon_evict_metadata);
3678 arc_kstat_update_state(arc_mru,
3679 &as->arcstat_mru_size,
3680 &as->arcstat_mru_evict_data,
3681 &as->arcstat_mru_evict_metadata);
3682 arc_kstat_update_state(arc_mru_ghost,
3683 &as->arcstat_mru_ghost_size,
3684 &as->arcstat_mru_ghost_evict_data,
3685 &as->arcstat_mru_ghost_evict_metadata);
3686 arc_kstat_update_state(arc_mfu,
3687 &as->arcstat_mfu_size,
3688 &as->arcstat_mfu_evict_data,
3689 &as->arcstat_mfu_evict_metadata);
fc41c640 3690 arc_kstat_update_state(arc_mfu_ghost,
13be560d
BB
3691 &as->arcstat_mfu_ghost_size,
3692 &as->arcstat_mfu_ghost_evict_data,
3693 &as->arcstat_mfu_ghost_evict_metadata);
3694 }
3695
3696 return (0);
3697}
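/*
 * Illustrative sketch (not part of arc.c): the kstat filled in by
 * arc_kstat_update() above is created as "zfs/arcstats" in arc_init()
 * below, and is how user space observes these per-state sizes. On Linux
 * builds it is typically exported through procfs; the path used here is
 * an assumption and may differ between platforms and versions.
 */
#include <stdio.h>

int
main(void)
{
	char line[256];
	FILE *fp = fopen("/proc/spl/kstat/zfs/arcstats", "r");

	if (fp == NULL) {
		perror("arcstats");
		return (1);
	}
	while (fgets(line, sizeof (line), fp) != NULL)
		fputs(line, stdout);	/* name, type and value columns */
	(void) fclose(fp);
	return (0);
}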
3698
34dc7c2f
BB
3699void
3700arc_init(void)
3701{
3702 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3703 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3704
3705 /* Convert seconds to clock ticks */
3706 arc_min_prefetch_lifespan = 1 * hz;
3707
3708 /* Start out with 1/8 of all memory */
3709 arc_c = physmem * PAGESIZE / 8;
3710
3711#ifdef _KERNEL
3712 /*
3713 * On architectures where the physical memory can be larger
 3714 	 * than the addressable space (Intel in 32-bit mode), we may
3715 * need to limit the cache to 1/8 of VM size.
3716 */
3717 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
7cb67b45
BB
3718 /*
3719 * Register a shrinker to support synchronous (direct) memory
3720 * reclaim from the arc. This is done to prevent kswapd from
3721 * swapping out pages when it is preferable to shrink the arc.
3722 */
3723 spl_register_shrinker(&arc_shrinker);
34dc7c2f
BB
3724#endif
3725
3726 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3727 arc_c_min = MAX(arc_c / 4, 64<<20);
518b4876 3728 /* set max to 1/2 of all memory */
23bdb07d 3729 arc_c_max = MAX(arc_c * 4, arc_c_max);
34dc7c2f
BB
3730
3731 /*
3732 * Allow the tunables to override our calculations if they are
 3733 	 * reasonable (i.e. over 64MB)
3734 */
3735 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3736 arc_c_max = zfs_arc_max;
3737 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3738 arc_c_min = zfs_arc_min;
3739
3740 arc_c = arc_c_max;
3741 arc_p = (arc_c >> 1);
3742
3743 /* limit meta-data to 1/4 of the arc capacity */
3744 arc_meta_limit = arc_c_max / 4;
1834f2d8 3745 arc_meta_max = 0;
34dc7c2f
BB
3746
3747 /* Allow the tunable to override if it is reasonable */
3748 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3749 arc_meta_limit = zfs_arc_meta_limit;
3750
3751 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3752 arc_c_min = arc_meta_limit / 2;
3753
d164b209
BB
3754 if (zfs_arc_grow_retry > 0)
3755 arc_grow_retry = zfs_arc_grow_retry;
3756
3757 if (zfs_arc_shrink_shift > 0)
3758 arc_shrink_shift = zfs_arc_shrink_shift;
3759
3760 if (zfs_arc_p_min_shift > 0)
3761 arc_p_min_shift = zfs_arc_p_min_shift;
3762
ab26409d
BB
3763 if (zfs_arc_meta_prune > 0)
3764 arc_meta_prune = zfs_arc_meta_prune;
6a8f9b6b 3765
34dc7c2f
BB
3766 /* if kmem_flags are set, lets try to use less memory */
3767 if (kmem_debugging())
3768 arc_c = arc_c / 2;
3769 if (arc_c < arc_c_min)
3770 arc_c = arc_c_min;
3771
3772 arc_anon = &ARC_anon;
3773 arc_mru = &ARC_mru;
3774 arc_mru_ghost = &ARC_mru_ghost;
3775 arc_mfu = &ARC_mfu;
3776 arc_mfu_ghost = &ARC_mfu_ghost;
3777 arc_l2c_only = &ARC_l2c_only;
3778 arc_size = 0;
3779
3780 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3781 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3782 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3783 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3784 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3785 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3786
3787 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3788 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3789 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3790 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3791 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3792 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3793 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3794 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3795 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3796 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3797 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3798 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3799 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3800 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3801 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3802 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3803 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3804 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3805 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3806 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3807
3808 buf_init();
3809
3810 arc_thread_exit = 0;
ab26409d
BB
3811 list_create(&arc_prune_list, sizeof (arc_prune_t),
3812 offsetof(arc_prune_t, p_node));
34dc7c2f 3813 arc_eviction_list = NULL;
ab26409d 3814 mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
3815 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3816 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3817
3818 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3819 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3820
3821 if (arc_ksp != NULL) {
3822 arc_ksp->ks_data = &arc_stats;
13be560d 3823 arc_ksp->ks_update = arc_kstat_update;
34dc7c2f
BB
3824 kstat_install(arc_ksp);
3825 }
3826
302f753f 3827 (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
34dc7c2f
BB
3828 TS_RUN, minclsyspri);
3829
3830 arc_dead = FALSE;
b128c09f 3831 arc_warm = B_FALSE;
34dc7c2f
BB
3832
3833 if (zfs_write_limit_max == 0)
b128c09f 3834 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
34dc7c2f
BB
3835 else
3836 zfs_write_limit_shift = 0;
b128c09f 3837 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f
BB
3838}
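/*
 * Illustrative sketch (not part of arc.c): the default sizing arithmetic
 * of arc_init() above for a hypothetical machine with 16 GB of RAM and no
 * zfs_arc_* tunables set. arc_c is seeded at 1/8 of memory, the floor at
 * 1/32 of memory (or 64 MB), the ceiling at 1/2 of memory, and metadata
 * at 1/4 of the ceiling; the floor is then raised to at least half the
 * metadata limit, arc_c is raised to the ceiling, and arc_p starts at
 * half of arc_c.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t physmem = 16ULL << 30;			/* assumed 16 GB */
	uint64_t c = physmem / 8;			/* 2 GB seed */
	uint64_t c_min = (c / 4 > (64ULL << 20)) ? c / 4 : (64ULL << 20);
	uint64_t c_max = c * 4;				/* 8 GB ceiling */
	uint64_t meta_limit = c_max / 4;		/* 2 GB */

	if (c_min < meta_limit / 2)
		c_min = meta_limit / 2;			/* raised to 1 GB */

	printf("arc_c_min=%llu MB arc_c_max=%llu MB arc_meta_limit=%llu MB "
	    "arc_p=%llu MB\n",
	    (unsigned long long)(c_min >> 20),
	    (unsigned long long)(c_max >> 20),
	    (unsigned long long)(meta_limit >> 20),
	    (unsigned long long)(c_max >> 21));
	return (0);
}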
3839
3840void
3841arc_fini(void)
3842{
ab26409d
BB
3843 arc_prune_t *p;
3844
34dc7c2f 3845 mutex_enter(&arc_reclaim_thr_lock);
7cb67b45
BB
3846#ifdef _KERNEL
3847 spl_unregister_shrinker(&arc_shrinker);
3848#endif /* _KERNEL */
3849
34dc7c2f
BB
3850 arc_thread_exit = 1;
3851 while (arc_thread_exit != 0)
3852 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3853 mutex_exit(&arc_reclaim_thr_lock);
3854
3855 arc_flush(NULL);
3856
3857 arc_dead = TRUE;
3858
3859 if (arc_ksp != NULL) {
3860 kstat_delete(arc_ksp);
3861 arc_ksp = NULL;
3862 }
3863
ab26409d
BB
3864 mutex_enter(&arc_prune_mtx);
3865 while ((p = list_head(&arc_prune_list)) != NULL) {
3866 list_remove(&arc_prune_list, p);
3867 refcount_remove(&p->p_refcnt, &arc_prune_list);
3868 refcount_destroy(&p->p_refcnt);
3869 kmem_free(p, sizeof (*p));
3870 }
3871 mutex_exit(&arc_prune_mtx);
3872
3873 list_destroy(&arc_prune_list);
3874 mutex_destroy(&arc_prune_mtx);
34dc7c2f
BB
3875 mutex_destroy(&arc_eviction_mtx);
3876 mutex_destroy(&arc_reclaim_thr_lock);
3877 cv_destroy(&arc_reclaim_thr_cv);
3878
3879 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3880 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3881 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3882 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3883 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3884 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3885 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3886 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3887
3888 mutex_destroy(&arc_anon->arcs_mtx);
3889 mutex_destroy(&arc_mru->arcs_mtx);
3890 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3891 mutex_destroy(&arc_mfu->arcs_mtx);
3892 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
fb5f0bc8 3893 mutex_destroy(&arc_l2c_only->arcs_mtx);
34dc7c2f 3894
b128c09f
BB
3895 mutex_destroy(&zfs_write_limit_lock);
3896
34dc7c2f 3897 buf_fini();
9babb374
BB
3898
3899 ASSERT(arc_loaned_bytes == 0);
34dc7c2f
BB
3900}
3901
3902/*
3903 * Level 2 ARC
3904 *
3905 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3906 * It uses dedicated storage devices to hold cached data, which are populated
3907 * using large infrequent writes. The main role of this cache is to boost
3908 * the performance of random read workloads. The intended L2ARC devices
3909 * include short-stroked disks, solid state disks, and other media with
3910 * substantially faster read latency than disk.
3911 *
3912 * +-----------------------+
3913 * | ARC |
3914 * +-----------------------+
3915 * | ^ ^
3916 * | | |
3917 * l2arc_feed_thread() arc_read()
3918 * | | |
3919 * | l2arc read |
3920 * V | |
3921 * +---------------+ |
3922 * | L2ARC | |
3923 * +---------------+ |
3924 * | ^ |
3925 * l2arc_write() | |
3926 * | | |
3927 * V | |
3928 * +-------+ +-------+
3929 * | vdev | | vdev |
3930 * | cache | | cache |
3931 * +-------+ +-------+
3932 * +=========+ .-----.
3933 * : L2ARC : |-_____-|
3934 * : devices : | Disks |
3935 * +=========+ `-_____-'
3936 *
3937 * Read requests are satisfied from the following sources, in order:
3938 *
3939 * 1) ARC
3940 * 2) vdev cache of L2ARC devices
3941 * 3) L2ARC devices
3942 * 4) vdev cache of disks
3943 * 5) disks
3944 *
3945 * Some L2ARC device types exhibit extremely slow write performance.
 3946  * To accommodate this, there are some significant differences between
3947 * the L2ARC and traditional cache design:
3948 *
3949 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
3950 * the ARC behave as usual, freeing buffers and placing headers on ghost
3951 * lists. The ARC does not send buffers to the L2ARC during eviction as
3952 * this would add inflated write latencies for all ARC memory pressure.
3953 *
3954 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3955 * It does this by periodically scanning buffers from the eviction-end of
3956 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3957 * not already there. It scans until a headroom of buffers is satisfied,
3958 * which itself is a buffer for ARC eviction. The thread that does this is
3959 * l2arc_feed_thread(), illustrated below; example sizes are included to
3960 * provide a better sense of ratio than this diagram:
3961 *
3962 * head --> tail
3963 * +---------------------+----------+
3964 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
3965 * +---------------------+----------+ | o L2ARC eligible
3966 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
3967 * +---------------------+----------+ |
3968 * 15.9 Gbytes ^ 32 Mbytes |
3969 * headroom |
3970 * l2arc_feed_thread()
3971 * |
3972 * l2arc write hand <--[oooo]--'
3973 * | 8 Mbyte
3974 * | write max
3975 * V
3976 * +==============================+
3977 * L2ARC dev |####|#|###|###| |####| ... |
3978 * +==============================+
3979 * 32 Gbytes
3980 *
3981 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3982 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3983 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
3984 * safe to say that this is an uncommon case, since buffers at the end of
3985 * the ARC lists have moved there due to inactivity.
3986 *
3987 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3988 * then the L2ARC simply misses copying some buffers. This serves as a
3989 * pressure valve to prevent heavy read workloads from both stalling the ARC
3990 * with waits and clogging the L2ARC with writes. This also helps prevent
3991 * the potential for the L2ARC to churn if it attempts to cache content too
3992 * quickly, such as during backups of the entire pool.
3993 *
b128c09f
BB
3994 * 5. After system boot and before the ARC has filled main memory, there are
3995 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 3996  * lists can remain mostly static. Instead of searching from the tail of these
3997 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3998 * for eligible buffers, greatly increasing its chance of finding them.
3999 *
4000 * The L2ARC device write speed is also boosted during this time so that
4001 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
4002 * there are no L2ARC reads, and no fear of degrading read performance
4003 * through increased writes.
4004 *
4005 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
34dc7c2f
BB
4006 * the vdev queue can aggregate them into larger and fewer writes. Each
4007 * device is written to in a rotor fashion, sweeping writes through
4008 * available space then repeating.
4009 *
b128c09f 4010 * 7. The L2ARC does not store dirty content. It never needs to flush
34dc7c2f
BB
 4011  * write buffers back to disk-based storage.
4012 *
b128c09f 4013 * 8. If an ARC buffer is written (and dirtied) which also exists in the
34dc7c2f
BB
4014 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4015 *
4016 * The performance of the L2ARC can be tweaked by a number of tunables, which
4017 * may be necessary for different workloads:
4018 *
4019 * l2arc_write_max max write bytes per interval
b128c09f 4020 * l2arc_write_boost extra write bytes during device warmup
34dc7c2f
BB
4021 * l2arc_noprefetch skip caching prefetched buffers
4022 * l2arc_headroom number of max device writes to precache
4023 * l2arc_feed_secs seconds between L2ARC writing
4024 *
4025 * Tunables may be removed or added as future performance improvements are
4026 * integrated, and also may become zpool properties.
d164b209
BB
4027 *
4028 * There are three key functions that control how the L2ARC warms up:
4029 *
4030 * l2arc_write_eligible() check if a buffer is eligible to cache
4031 * l2arc_write_size() calculate how much to write
4032 * l2arc_write_interval() calculate sleep delay between writes
4033 *
4034 * These three functions determine what to write, how much, and how quickly
4035 * to send writes.
34dc7c2f
BB
4036 */
4037
d164b209
BB
4038static boolean_t
4039l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4040{
4041 /*
4042 * A buffer is *not* eligible for the L2ARC if it:
4043 * 1. belongs to a different spa.
428870ff
BB
4044 * 2. is already cached on the L2ARC.
4045 * 3. has an I/O in progress (it may be an incomplete read).
4046 * 4. is flagged not eligible (zfs property).
d164b209 4047 */
428870ff 4048 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
d164b209
BB
4049 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4050 return (B_FALSE);
4051
4052 return (B_TRUE);
4053}
4054
4055static uint64_t
4056l2arc_write_size(l2arc_dev_t *dev)
4057{
4058 uint64_t size;
4059
4060 size = dev->l2ad_write;
4061
4062 if (arc_warm == B_FALSE)
4063 size += dev->l2ad_boost;
4064
4065 return (size);
4066
4067}
4068
4069static clock_t
4070l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4071{
428870ff 4072 clock_t interval, next, now;
d164b209
BB
4073
4074 /*
4075 * If the ARC lists are busy, increase our write rate; if the
4076 * lists are stale, idle back. This is achieved by checking
4077 * how much we previously wrote - if it was more than half of
4078 * what we wanted, schedule the next write much sooner.
4079 */
4080 if (l2arc_feed_again && wrote > (wanted / 2))
4081 interval = (hz * l2arc_feed_min_ms) / 1000;
4082 else
4083 interval = hz * l2arc_feed_secs;
4084
428870ff
BB
4085 now = ddi_get_lbolt();
4086 next = MAX(now, MIN(now + interval, began + interval));
d164b209
BB
4087
4088 return (next);
4089}
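/*
 * Illustrative sketch (not part of arc.c): a stand-alone model of the two
 * feed policies above. The module defaults are assumed to be an 8 MB
 * l2arc_write_max, an 8 MB l2arc_write_boost, l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200, with hz assumed to be 1000. A cold pass
 * (arc_warm == B_FALSE) may write 16 MB, and a pass that wrote more than
 * half of what it wanted is rescheduled after 200 ticks instead of 1000.
 */
#include <stdint.h>
#include <stdio.h>

#define	HZ		1000	/* assumed tick rate */
#define	FEED_SECS	1
#define	FEED_MIN_MS	200

static uint64_t
model_write_size(uint64_t write_max, uint64_t boost, int arc_warm)
{
	return (arc_warm ? write_max : write_max + boost);
}

static long
model_interval(long began, long now, uint64_t wanted, uint64_t wrote)
{
	long interval = (wrote > wanted / 2) ?
	    (HZ * FEED_MIN_MS) / 1000 : HZ * FEED_SECS;
	long a = now + interval, b = began + interval;
	long next = (a < b) ? a : b;		/* earlier of the two deadlines */

	return (next > now ? next : now);	/* never schedule in the past */
}

int
main(void)
{
	uint64_t size = model_write_size(8ULL << 20, 8ULL << 20, 0);

	printf("cold write budget: %llu MB\n",
	    (unsigned long long)(size >> 20));
	printf("busy, next feed at tick %ld\n",
	    model_interval(0, 50, size, size));	/* 200 */
	printf("idle, next feed at tick %ld\n",
	    model_interval(0, 50, size, 0));	/* 1000 */
	return (0);
}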
4090
34dc7c2f
BB
4091static void
4092l2arc_hdr_stat_add(void)
4093{
4094 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4095 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4096}
4097
4098static void
4099l2arc_hdr_stat_remove(void)
4100{
4101 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4102 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4103}
4104
4105/*
4106 * Cycle through L2ARC devices. This is how L2ARC load balances.
b128c09f 4107 * If a device is returned, this also returns holding the spa config lock.
34dc7c2f
BB
4108 */
4109static l2arc_dev_t *
4110l2arc_dev_get_next(void)
4111{
b128c09f 4112 l2arc_dev_t *first, *next = NULL;
34dc7c2f 4113
b128c09f
BB
4114 /*
4115 * Lock out the removal of spas (spa_namespace_lock), then removal
4116 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4117 * both locks will be dropped and a spa config lock held instead.
4118 */
4119 mutex_enter(&spa_namespace_lock);
4120 mutex_enter(&l2arc_dev_mtx);
4121
4122 /* if there are no vdevs, there is nothing to do */
4123 if (l2arc_ndev == 0)
4124 goto out;
4125
4126 first = NULL;
4127 next = l2arc_dev_last;
4128 do {
4129 /* loop around the list looking for a non-faulted vdev */
4130 if (next == NULL) {
34dc7c2f 4131 next = list_head(l2arc_dev_list);
b128c09f
BB
4132 } else {
4133 next = list_next(l2arc_dev_list, next);
4134 if (next == NULL)
4135 next = list_head(l2arc_dev_list);
4136 }
4137
4138 /* if we have come back to the start, bail out */
4139 if (first == NULL)
4140 first = next;
4141 else if (next == first)
4142 break;
4143
4144 } while (vdev_is_dead(next->l2ad_vdev));
4145
4146 /* if we were unable to find any usable vdevs, return NULL */
4147 if (vdev_is_dead(next->l2ad_vdev))
4148 next = NULL;
34dc7c2f
BB
4149
4150 l2arc_dev_last = next;
4151
b128c09f
BB
4152out:
4153 mutex_exit(&l2arc_dev_mtx);
4154
4155 /*
4156 * Grab the config lock to prevent the 'next' device from being
4157 * removed while we are writing to it.
4158 */
4159 if (next != NULL)
4160 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4161 mutex_exit(&spa_namespace_lock);
4162
34dc7c2f
BB
4163 return (next);
4164}
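/*
 * Illustrative sketch (not part of arc.c): the rotor above remembers the
 * last device fed and resumes from the one after it, skipping faulted
 * vdevs and wrapping around the list. A minimal user-space model over a
 * fixed array; model_dev_t, rotor_next() and the device names are
 * invented for the example.
 */
#include <stdio.h>

typedef struct {
	const char *name;
	int faulted;
} model_dev_t;

/* Return the index of the next healthy device after `last`, or -1. */
static int
rotor_next(const model_dev_t *devs, int ndev, int last)
{
	int i;

	for (i = 1; i <= ndev; i++) {
		int idx = (last + i) % ndev;

		if (!devs[idx].faulted)
			return (idx);
	}
	return (-1);
}

int
main(void)
{
	model_dev_t devs[] = {
		{ "cache-a", 0 }, { "cache-b", 1 }, { "cache-c", 0 }
	};
	int last = 0;

	last = rotor_next(devs, 3, last);	/* skips faulted cache-b */
	printf("%s\n", devs[last].name);	/* cache-c */
	last = rotor_next(devs, 3, last);	/* wraps back around */
	printf("%s\n", devs[last].name);	/* cache-a */
	return (0);
}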
4165
b128c09f
BB
4166/*
4167 * Free buffers that were tagged for destruction.
4168 */
4169static void
0bc8fd78 4170l2arc_do_free_on_write(void)
b128c09f
BB
4171{
4172 list_t *buflist;
4173 l2arc_data_free_t *df, *df_prev;
4174
4175 mutex_enter(&l2arc_free_on_write_mtx);
4176 buflist = l2arc_free_on_write;
4177
4178 for (df = list_tail(buflist); df; df = df_prev) {
4179 df_prev = list_prev(buflist, df);
4180 ASSERT(df->l2df_data != NULL);
4181 ASSERT(df->l2df_func != NULL);
4182 df->l2df_func(df->l2df_data, df->l2df_size);
4183 list_remove(buflist, df);
4184 kmem_free(df, sizeof (l2arc_data_free_t));
4185 }
4186
4187 mutex_exit(&l2arc_free_on_write_mtx);
4188}
4189
34dc7c2f
BB
4190/*
4191 * A write to a cache device has completed. Update all headers to allow
4192 * reads from these buffers to begin.
4193 */
4194static void
4195l2arc_write_done(zio_t *zio)
4196{
4197 l2arc_write_callback_t *cb;
4198 l2arc_dev_t *dev;
4199 list_t *buflist;
34dc7c2f 4200 arc_buf_hdr_t *head, *ab, *ab_prev;
b128c09f 4201 l2arc_buf_hdr_t *abl2;
34dc7c2f
BB
4202 kmutex_t *hash_lock;
4203
4204 cb = zio->io_private;
4205 ASSERT(cb != NULL);
4206 dev = cb->l2wcb_dev;
4207 ASSERT(dev != NULL);
4208 head = cb->l2wcb_head;
4209 ASSERT(head != NULL);
4210 buflist = dev->l2ad_buflist;
4211 ASSERT(buflist != NULL);
4212 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4213 l2arc_write_callback_t *, cb);
4214
4215 if (zio->io_error != 0)
4216 ARCSTAT_BUMP(arcstat_l2_writes_error);
4217
4218 mutex_enter(&l2arc_buflist_mtx);
4219
4220 /*
4221 * All writes completed, or an error was hit.
4222 */
4223 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4224 ab_prev = list_prev(buflist, ab);
4225
4226 hash_lock = HDR_LOCK(ab);
4227 if (!mutex_tryenter(hash_lock)) {
4228 /*
4229 * This buffer misses out. It may be in a stage
4230 * of eviction. Its ARC_L2_WRITING flag will be
4231 * left set, denying reads to this buffer.
4232 */
4233 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4234 continue;
4235 }
4236
4237 if (zio->io_error != 0) {
4238 /*
b128c09f 4239 * Error - drop L2ARC entry.
34dc7c2f 4240 */
b128c09f
BB
4241 list_remove(buflist, ab);
4242 abl2 = ab->b_l2hdr;
34dc7c2f 4243 ab->b_l2hdr = NULL;
b128c09f
BB
4244 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4245 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
34dc7c2f
BB
4246 }
4247
4248 /*
4249 * Allow ARC to begin reads to this L2ARC entry.
4250 */
4251 ab->b_flags &= ~ARC_L2_WRITING;
4252
4253 mutex_exit(hash_lock);
4254 }
4255
4256 atomic_inc_64(&l2arc_writes_done);
4257 list_remove(buflist, head);
4258 kmem_cache_free(hdr_cache, head);
4259 mutex_exit(&l2arc_buflist_mtx);
4260
b128c09f 4261 l2arc_do_free_on_write();
34dc7c2f
BB
4262
4263 kmem_free(cb, sizeof (l2arc_write_callback_t));
4264}
4265
4266/*
4267 * A read to a cache device completed. Validate buffer contents before
4268 * handing over to the regular ARC routines.
4269 */
4270static void
4271l2arc_read_done(zio_t *zio)
4272{
4273 l2arc_read_callback_t *cb;
4274 arc_buf_hdr_t *hdr;
4275 arc_buf_t *buf;
34dc7c2f 4276 kmutex_t *hash_lock;
b128c09f
BB
4277 int equal;
4278
4279 ASSERT(zio->io_vd != NULL);
4280 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4281
4282 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
34dc7c2f
BB
4283
4284 cb = zio->io_private;
4285 ASSERT(cb != NULL);
4286 buf = cb->l2rcb_buf;
4287 ASSERT(buf != NULL);
34dc7c2f 4288
428870ff 4289 hash_lock = HDR_LOCK(buf->b_hdr);
34dc7c2f 4290 mutex_enter(hash_lock);
428870ff
BB
4291 hdr = buf->b_hdr;
4292 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
34dc7c2f
BB
4293
4294 /*
 4295 	 * Check whether this buffer survived the L2ARC journey.
4296 */
4297 equal = arc_cksum_equal(buf);
4298 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4299 mutex_exit(hash_lock);
4300 zio->io_private = buf;
b128c09f
BB
4301 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4302 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
34dc7c2f
BB
4303 arc_read_done(zio);
4304 } else {
4305 mutex_exit(hash_lock);
4306 /*
4307 * Buffer didn't survive caching. Increment stats and
4308 * reissue to the original storage device.
4309 */
b128c09f 4310 if (zio->io_error != 0) {
34dc7c2f 4311 ARCSTAT_BUMP(arcstat_l2_io_error);
b128c09f
BB
4312 } else {
4313 zio->io_error = EIO;
4314 }
34dc7c2f
BB
4315 if (!equal)
4316 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4317
34dc7c2f 4318 /*
b128c09f
BB
4319 * If there's no waiter, issue an async i/o to the primary
4320 * storage now. If there *is* a waiter, the caller must
4321 * issue the i/o in a context where it's OK to block.
34dc7c2f 4322 */
d164b209
BB
4323 if (zio->io_waiter == NULL) {
4324 zio_t *pio = zio_unique_parent(zio);
4325
4326 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4327
4328 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
b128c09f
BB
4329 buf->b_data, zio->io_size, arc_read_done, buf,
4330 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
d164b209 4331 }
34dc7c2f
BB
4332 }
4333
4334 kmem_free(cb, sizeof (l2arc_read_callback_t));
4335}
4336
4337/*
4338 * This is the list priority from which the L2ARC will search for pages to
4339 * cache. This is used within loops (0..3) to cycle through lists in the
4340 * desired order. This order can have a significant effect on cache
4341 * performance.
4342 *
4343 * Currently the metadata lists are hit first, MFU then MRU, followed by
4344 * the data lists. This function returns a locked list, and also returns
4345 * the lock pointer.
4346 */
4347static list_t *
4348l2arc_list_locked(int list_num, kmutex_t **lock)
4349{
d4ed6673 4350 list_t *list = NULL;
34dc7c2f
BB
4351
4352 ASSERT(list_num >= 0 && list_num <= 3);
4353
4354 switch (list_num) {
4355 case 0:
4356 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4357 *lock = &arc_mfu->arcs_mtx;
4358 break;
4359 case 1:
4360 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4361 *lock = &arc_mru->arcs_mtx;
4362 break;
4363 case 2:
4364 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4365 *lock = &arc_mfu->arcs_mtx;
4366 break;
4367 case 3:
4368 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4369 *lock = &arc_mru->arcs_mtx;
4370 break;
4371 }
4372
4373 ASSERT(!(MUTEX_HELD(*lock)));
4374 mutex_enter(*lock);
4375 return (list);
4376}
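/*
 * Illustrative sketch (not part of arc.c): the scan order encoded above,
 * spelled out. Passes 0 through 3 of l2arc_write_buffers() visit the ARC
 * lists in this sequence, so metadata is considered before data and MFU
 * before MRU.
 */
#include <stdio.h>

int
main(void)
{
	static const char *scan_order[4] = {
		"MFU metadata",		/* list_num 0 */
		"MRU metadata",		/* list_num 1 */
		"MFU data",		/* list_num 2 */
		"MRU data"		/* list_num 3 */
	};
	int i;

	for (i = 0; i < 4; i++)
		printf("pass %d: %s\n", i, scan_order[i]);
	return (0);
}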
4377
4378/*
4379 * Evict buffers from the device write hand to the distance specified in
 4380  * bytes. This distance may span populated buffers, or it may span nothing.
4381 * This is clearing a region on the L2ARC device ready for writing.
4382 * If the 'all' boolean is set, every buffer is evicted.
4383 */
4384static void
4385l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4386{
4387 list_t *buflist;
4388 l2arc_buf_hdr_t *abl2;
4389 arc_buf_hdr_t *ab, *ab_prev;
4390 kmutex_t *hash_lock;
4391 uint64_t taddr;
4392
34dc7c2f
BB
4393 buflist = dev->l2ad_buflist;
4394
4395 if (buflist == NULL)
4396 return;
4397
4398 if (!all && dev->l2ad_first) {
4399 /*
4400 * This is the first sweep through the device. There is
4401 * nothing to evict.
4402 */
4403 return;
4404 }
4405
b128c09f 4406 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
34dc7c2f
BB
4407 /*
4408 * When nearing the end of the device, evict to the end
4409 * before the device write hand jumps to the start.
4410 */
4411 taddr = dev->l2ad_end;
4412 } else {
4413 taddr = dev->l2ad_hand + distance;
4414 }
4415 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4416 uint64_t, taddr, boolean_t, all);
4417
4418top:
4419 mutex_enter(&l2arc_buflist_mtx);
4420 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4421 ab_prev = list_prev(buflist, ab);
4422
4423 hash_lock = HDR_LOCK(ab);
4424 if (!mutex_tryenter(hash_lock)) {
4425 /*
4426 * Missed the hash lock. Retry.
4427 */
4428 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4429 mutex_exit(&l2arc_buflist_mtx);
4430 mutex_enter(hash_lock);
4431 mutex_exit(hash_lock);
4432 goto top;
4433 }
4434
4435 if (HDR_L2_WRITE_HEAD(ab)) {
4436 /*
4437 * We hit a write head node. Leave it for
4438 * l2arc_write_done().
4439 */
4440 list_remove(buflist, ab);
4441 mutex_exit(hash_lock);
4442 continue;
4443 }
4444
4445 if (!all && ab->b_l2hdr != NULL &&
4446 (ab->b_l2hdr->b_daddr > taddr ||
4447 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4448 /*
4449 * We've evicted to the target address,
4450 * or the end of the device.
4451 */
4452 mutex_exit(hash_lock);
4453 break;
4454 }
4455
4456 if (HDR_FREE_IN_PROGRESS(ab)) {
4457 /*
4458 * Already on the path to destruction.
4459 */
4460 mutex_exit(hash_lock);
4461 continue;
4462 }
4463
4464 if (ab->b_state == arc_l2c_only) {
4465 ASSERT(!HDR_L2_READING(ab));
4466 /*
4467 * This doesn't exist in the ARC. Destroy.
4468 * arc_hdr_destroy() will call list_remove()
4469 * and decrement arcstat_l2_size.
4470 */
4471 arc_change_state(arc_anon, ab, hash_lock);
4472 arc_hdr_destroy(ab);
4473 } else {
b128c09f
BB
4474 /*
4475 * Invalidate issued or about to be issued
4476 * reads, since we may be about to write
4477 * over this location.
4478 */
4479 if (HDR_L2_READING(ab)) {
4480 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4481 ab->b_flags |= ARC_L2_EVICTED;
4482 }
4483
34dc7c2f
BB
4484 /*
4485 * Tell ARC this no longer exists in L2ARC.
4486 */
4487 if (ab->b_l2hdr != NULL) {
4488 abl2 = ab->b_l2hdr;
4489 ab->b_l2hdr = NULL;
4490 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4491 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4492 }
4493 list_remove(buflist, ab);
4494
4495 /*
 4496 			 * This may have been left over after a
4497 * failed write.
4498 */
4499 ab->b_flags &= ~ARC_L2_WRITING;
34dc7c2f
BB
4500 }
4501 mutex_exit(hash_lock);
4502 }
4503 mutex_exit(&l2arc_buflist_mtx);
4504
428870ff 4505 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
34dc7c2f
BB
4506 dev->l2ad_evict = taddr;
4507}
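/*
 * Illustrative sketch (not part of arc.c): how l2arc_evict() above picks
 * its target address. When the write hand is within two write distances
 * of the end of the device, eviction runs all the way to the end so the
 * hand can wrap; otherwise it clears exactly one distance ahead. Device
 * and write sizes below are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
evict_target(uint64_t hand, uint64_t end, uint64_t distance)
{
	if (hand >= end - (2 * distance))
		return (end);		/* evict to the end, then wrap */
	return (hand + distance);	/* clear one write's worth ahead */
}

int
main(void)
{
	uint64_t end = 32ULL << 30;		/* 32 GB cache device */
	uint64_t distance = 8ULL << 20;		/* 8 MB write */

	/* hand in the middle of the device: target is hand + 8 MB */
	printf("%llu\n", (unsigned long long)
	    evict_target(1ULL << 30, end, distance));
	/* hand 10 MB from the end, inside 2 * 8 MB: target is the end */
	printf("%llu\n", (unsigned long long)
	    evict_target(end - (10ULL << 20), end, distance));
	return (0);
}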
4508
4509/*
4510 * Find and write ARC buffers to the L2ARC device.
4511 *
4512 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4513 * for reading until they have completed writing.
4514 */
d164b209 4515static uint64_t
b128c09f 4516l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
34dc7c2f
BB
4517{
4518 arc_buf_hdr_t *ab, *ab_prev, *head;
4519 l2arc_buf_hdr_t *hdrl2;
4520 list_t *list;
b128c09f 4521 uint64_t passed_sz, write_sz, buf_sz, headroom;
34dc7c2f 4522 void *buf_data;
d4ed6673 4523 kmutex_t *hash_lock, *list_lock = NULL;
34dc7c2f
BB
4524 boolean_t have_lock, full;
4525 l2arc_write_callback_t *cb;
4526 zio_t *pio, *wzio;
3541dc6d 4527 uint64_t guid = spa_load_guid(spa);
d6320ddb 4528 int try;
34dc7c2f 4529
34dc7c2f
BB
4530 ASSERT(dev->l2ad_vdev != NULL);
4531
4532 pio = NULL;
4533 write_sz = 0;
4534 full = B_FALSE;
4535 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4536 head->b_flags |= ARC_L2_WRITE_HEAD;
4537
4538 /*
4539 * Copy buffers for L2ARC writing.
4540 */
4541 mutex_enter(&l2arc_buflist_mtx);
d6320ddb 4542 for (try = 0; try <= 3; try++) {
34dc7c2f
BB
4543 list = l2arc_list_locked(try, &list_lock);
4544 passed_sz = 0;
4545
b128c09f
BB
4546 /*
4547 * L2ARC fast warmup.
4548 *
4549 * Until the ARC is warm and starts to evict, read from the
4550 * head of the ARC lists rather than the tail.
4551 */
4552 headroom = target_sz * l2arc_headroom;
4553 if (arc_warm == B_FALSE)
4554 ab = list_head(list);
4555 else
4556 ab = list_tail(list);
4557
4558 for (; ab; ab = ab_prev) {
4559 if (arc_warm == B_FALSE)
4560 ab_prev = list_next(list, ab);
4561 else
4562 ab_prev = list_prev(list, ab);
34dc7c2f
BB
4563
4564 hash_lock = HDR_LOCK(ab);
4565 have_lock = MUTEX_HELD(hash_lock);
4566 if (!have_lock && !mutex_tryenter(hash_lock)) {
4567 /*
4568 * Skip this buffer rather than waiting.
4569 */
4570 continue;
4571 }
4572
4573 passed_sz += ab->b_size;
4574 if (passed_sz > headroom) {
4575 /*
4576 * Searched too far.
4577 */
4578 mutex_exit(hash_lock);
4579 break;
4580 }
4581
d164b209 4582 if (!l2arc_write_eligible(guid, ab)) {
34dc7c2f
BB
4583 mutex_exit(hash_lock);
4584 continue;
4585 }
4586
4587 if ((write_sz + ab->b_size) > target_sz) {
4588 full = B_TRUE;
4589 mutex_exit(hash_lock);
4590 break;
4591 }
4592
34dc7c2f
BB
4593 if (pio == NULL) {
4594 /*
4595 * Insert a dummy header on the buflist so
4596 * l2arc_write_done() can find where the
4597 * write buffers begin without searching.
4598 */
4599 list_insert_head(dev->l2ad_buflist, head);
4600
409dc1a5
PS
4601 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
4602 KM_PUSHPAGE);
34dc7c2f
BB
4603 cb->l2wcb_dev = dev;
4604 cb->l2wcb_head = head;
4605 pio = zio_root(spa, l2arc_write_done, cb,
4606 ZIO_FLAG_CANFAIL);
4607 }
4608
4609 /*
4610 * Create and add a new L2ARC header.
4611 */
409dc1a5
PS
4612 hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
4613 KM_PUSHPAGE);
34dc7c2f
BB
4614 hdrl2->b_dev = dev;
4615 hdrl2->b_daddr = dev->l2ad_hand;
4616
4617 ab->b_flags |= ARC_L2_WRITING;
4618 ab->b_l2hdr = hdrl2;
4619 list_insert_head(dev->l2ad_buflist, ab);
4620 buf_data = ab->b_buf->b_data;
4621 buf_sz = ab->b_size;
4622
4623 /*
4624 * Compute and store the buffer cksum before
 4625 			 * writing. On debug builds the cksum is verified first.
4626 */
4627 arc_cksum_verify(ab->b_buf);
4628 arc_cksum_compute(ab->b_buf, B_TRUE);
4629
4630 mutex_exit(hash_lock);
4631
4632 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4633 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4634 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4635 ZIO_FLAG_CANFAIL, B_FALSE);
4636
4637 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4638 zio_t *, wzio);
4639 (void) zio_nowait(wzio);
4640
b128c09f
BB
4641 /*
4642 * Keep the clock hand suitably device-aligned.
4643 */
4644 buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4645
34dc7c2f
BB
4646 write_sz += buf_sz;
4647 dev->l2ad_hand += buf_sz;
4648 }
4649
4650 mutex_exit(list_lock);
4651
4652 if (full == B_TRUE)
4653 break;
4654 }
4655 mutex_exit(&l2arc_buflist_mtx);
4656
4657 if (pio == NULL) {
4658 ASSERT3U(write_sz, ==, 0);
4659 kmem_cache_free(hdr_cache, head);
d164b209 4660 return (0);
34dc7c2f
BB
4661 }
4662
4663 ASSERT3U(write_sz, <=, target_sz);
4664 ARCSTAT_BUMP(arcstat_l2_writes_sent);
d164b209 4665 ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
34dc7c2f 4666 ARCSTAT_INCR(arcstat_l2_size, write_sz);
428870ff 4667 vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
34dc7c2f
BB
4668
4669 /*
4670 * Bump device hand to the device start if it is approaching the end.
4671 * l2arc_evict() will already have evicted ahead for this case.
4672 */
b128c09f 4673 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
428870ff
BB
4674 vdev_space_update(dev->l2ad_vdev,
4675 dev->l2ad_end - dev->l2ad_hand, 0, 0);
34dc7c2f
BB
4676 dev->l2ad_hand = dev->l2ad_start;
4677 dev->l2ad_evict = dev->l2ad_start;
4678 dev->l2ad_first = B_FALSE;
4679 }
4680
d164b209 4681 dev->l2ad_writing = B_TRUE;
34dc7c2f 4682 (void) zio_wait(pio);
d164b209
BB
4683 dev->l2ad_writing = B_FALSE;
4684
4685 return (write_sz);
34dc7c2f
BB
4686}
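/*
 * Illustrative sketch (not part of arc.c): the two cut-offs applied per
 * list by l2arc_write_buffers() above. Scanning stops once the bytes
 * passed over exceed target_sz * l2arc_headroom, and writing stops once
 * another buffer would push the bytes written past target_sz. The buffer
 * sizes are invented and l2arc_headroom is assumed to default to 2.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t target_sz = 8ULL << 20;		/* 8 MB write budget */
	uint64_t headroom = target_sz * 2;		/* 16 MB scan budget */
	uint64_t sizes[] = { 6ULL << 20, 6ULL << 20, 6ULL << 20 };
	uint64_t passed_sz = 0, write_sz = 0;
	int i;

	for (i = 0; i < 3; i++) {
		passed_sz += sizes[i];
		if (passed_sz > headroom) {
			printf("searched too far at buffer %d\n", i);
			break;
		}
		if (write_sz + sizes[i] > target_sz) {
			printf("write budget full at buffer %d\n", i);
			break;
		}
		write_sz += sizes[i];
	}
	printf("wrote %llu MB\n", (unsigned long long)(write_sz >> 20));
	return (0);
}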
4687
4688/*
4689 * This thread feeds the L2ARC at regular intervals. This is the beating
4690 * heart of the L2ARC.
4691 */
4692static void
4693l2arc_feed_thread(void)
4694{
4695 callb_cpr_t cpr;
4696 l2arc_dev_t *dev;
4697 spa_t *spa;
d164b209 4698 uint64_t size, wrote;
428870ff 4699 clock_t begin, next = ddi_get_lbolt();
34dc7c2f
BB
4700
4701 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4702
4703 mutex_enter(&l2arc_feed_thr_lock);
4704
4705 while (l2arc_thread_exit == 0) {
34dc7c2f 4706 CALLB_CPR_SAFE_BEGIN(&cpr);
5b63b3eb
BB
4707 (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
4708 &l2arc_feed_thr_lock, next);
34dc7c2f 4709 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
428870ff 4710 next = ddi_get_lbolt() + hz;
34dc7c2f
BB
4711
4712 /*
b128c09f 4713 * Quick check for L2ARC devices.
34dc7c2f
BB
4714 */
4715 mutex_enter(&l2arc_dev_mtx);
4716 if (l2arc_ndev == 0) {
4717 mutex_exit(&l2arc_dev_mtx);
4718 continue;
4719 }
b128c09f 4720 mutex_exit(&l2arc_dev_mtx);
428870ff 4721 begin = ddi_get_lbolt();
34dc7c2f
BB
4722
4723 /*
b128c09f
BB
4724 * This selects the next l2arc device to write to, and in
4725 * doing so the next spa to feed from: dev->l2ad_spa. This
4726 * will return NULL if there are now no l2arc devices or if
4727 * they are all faulted.
4728 *
4729 * If a device is returned, its spa's config lock is also
4730 * held to prevent device removal. l2arc_dev_get_next()
4731 * will grab and release l2arc_dev_mtx.
34dc7c2f 4732 */
b128c09f 4733 if ((dev = l2arc_dev_get_next()) == NULL)
34dc7c2f 4734 continue;
b128c09f
BB
4735
4736 spa = dev->l2ad_spa;
4737 ASSERT(spa != NULL);
34dc7c2f 4738
572e2857
BB
4739 /*
4740 * If the pool is read-only then force the feed thread to
4741 * sleep a little longer.
4742 */
4743 if (!spa_writeable(spa)) {
4744 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4745 spa_config_exit(spa, SCL_L2ARC, dev);
4746 continue;
4747 }
4748
34dc7c2f 4749 /*
b128c09f 4750 * Avoid contributing to memory pressure.
34dc7c2f 4751 */
302f753f 4752 if (arc_no_grow) {
b128c09f
BB
4753 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4754 spa_config_exit(spa, SCL_L2ARC, dev);
34dc7c2f
BB
4755 continue;
4756 }
b128c09f 4757
34dc7c2f
BB
4758 ARCSTAT_BUMP(arcstat_l2_feeds);
4759
d164b209 4760 size = l2arc_write_size(dev);
b128c09f 4761
34dc7c2f
BB
4762 /*
4763 * Evict L2ARC buffers that will be overwritten.
4764 */
b128c09f 4765 l2arc_evict(dev, size, B_FALSE);
34dc7c2f
BB
4766
4767 /*
4768 * Write ARC buffers.
4769 */
d164b209
BB
4770 wrote = l2arc_write_buffers(spa, dev, size);
4771
4772 /*
4773 * Calculate interval between writes.
4774 */
4775 next = l2arc_write_interval(begin, size, wrote);
b128c09f 4776 spa_config_exit(spa, SCL_L2ARC, dev);
34dc7c2f
BB
4777 }
4778
4779 l2arc_thread_exit = 0;
4780 cv_broadcast(&l2arc_feed_thr_cv);
4781 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
4782 thread_exit();
4783}
4784
b128c09f
BB
4785boolean_t
4786l2arc_vdev_present(vdev_t *vd)
4787{
4788 l2arc_dev_t *dev;
4789
4790 mutex_enter(&l2arc_dev_mtx);
4791 for (dev = list_head(l2arc_dev_list); dev != NULL;
4792 dev = list_next(l2arc_dev_list, dev)) {
4793 if (dev->l2ad_vdev == vd)
4794 break;
4795 }
4796 mutex_exit(&l2arc_dev_mtx);
4797
4798 return (dev != NULL);
4799}
4800
34dc7c2f
BB
4801/*
4802 * Add a vdev for use by the L2ARC. By this point the spa has already
4803 * validated the vdev and opened it.
4804 */
4805void
9babb374 4806l2arc_add_vdev(spa_t *spa, vdev_t *vd)
34dc7c2f
BB
4807{
4808 l2arc_dev_t *adddev;
4809
b128c09f
BB
4810 ASSERT(!l2arc_vdev_present(vd));
4811
34dc7c2f
BB
4812 /*
4813 * Create a new l2arc device entry.
4814 */
4815 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4816 adddev->l2ad_spa = spa;
4817 adddev->l2ad_vdev = vd;
4818 adddev->l2ad_write = l2arc_write_max;
b128c09f 4819 adddev->l2ad_boost = l2arc_write_boost;
9babb374
BB
4820 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4821 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
34dc7c2f
BB
4822 adddev->l2ad_hand = adddev->l2ad_start;
4823 adddev->l2ad_evict = adddev->l2ad_start;
4824 adddev->l2ad_first = B_TRUE;
d164b209 4825 adddev->l2ad_writing = B_FALSE;
98f72a53 4826 list_link_init(&adddev->l2ad_node);
34dc7c2f
BB
4827 ASSERT3U(adddev->l2ad_write, >, 0);
4828
4829 /*
4830 * This is a list of all ARC buffers that are still valid on the
4831 * device.
4832 */
4833 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4834 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4835 offsetof(arc_buf_hdr_t, b_l2node));
4836
428870ff 4837 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
34dc7c2f
BB
4838
4839 /*
4840 * Add device to global list
4841 */
4842 mutex_enter(&l2arc_dev_mtx);
4843 list_insert_head(l2arc_dev_list, adddev);
4844 atomic_inc_64(&l2arc_ndev);
4845 mutex_exit(&l2arc_dev_mtx);
4846}
4847
4848/*
4849 * Remove a vdev from the L2ARC.
4850 */
4851void
4852l2arc_remove_vdev(vdev_t *vd)
4853{
4854 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4855
34dc7c2f
BB
4856 /*
4857 * Find the device by vdev
4858 */
4859 mutex_enter(&l2arc_dev_mtx);
4860 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4861 nextdev = list_next(l2arc_dev_list, dev);
4862 if (vd == dev->l2ad_vdev) {
4863 remdev = dev;
4864 break;
4865 }
4866 }
4867 ASSERT(remdev != NULL);
4868
4869 /*
4870 * Remove device from global list
4871 */
4872 list_remove(l2arc_dev_list, remdev);
4873 l2arc_dev_last = NULL; /* may have been invalidated */
b128c09f
BB
4874 atomic_dec_64(&l2arc_ndev);
4875 mutex_exit(&l2arc_dev_mtx);
34dc7c2f
BB
4876
4877 /*
4878 * Clear all buflists and ARC references. L2ARC device flush.
4879 */
4880 l2arc_evict(remdev, 0, B_TRUE);
4881 list_destroy(remdev->l2ad_buflist);
4882 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4883 kmem_free(remdev, sizeof (l2arc_dev_t));
34dc7c2f
BB
4884}
4885
4886void
b128c09f 4887l2arc_init(void)
34dc7c2f
BB
4888{
4889 l2arc_thread_exit = 0;
4890 l2arc_ndev = 0;
4891 l2arc_writes_sent = 0;
4892 l2arc_writes_done = 0;
4893
4894 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4895 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4896 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4897 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4898 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4899
4900 l2arc_dev_list = &L2ARC_dev_list;
4901 l2arc_free_on_write = &L2ARC_free_on_write;
4902 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4903 offsetof(l2arc_dev_t, l2ad_node));
4904 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4905 offsetof(l2arc_data_free_t, l2df_list_node));
34dc7c2f
BB
4906}
4907
4908void
b128c09f 4909l2arc_fini(void)
34dc7c2f 4910{
b128c09f
BB
4911 /*
 4912 	 * This is called from dmu_fini(), which is called from spa_fini().
4913 * Because of this, we can assume that all l2arc devices have
4914 * already been removed when the pools themselves were removed.
4915 */
4916
4917 l2arc_do_free_on_write();
34dc7c2f
BB
4918
4919 mutex_destroy(&l2arc_feed_thr_lock);
4920 cv_destroy(&l2arc_feed_thr_cv);
4921 mutex_destroy(&l2arc_dev_mtx);
4922 mutex_destroy(&l2arc_buflist_mtx);
4923 mutex_destroy(&l2arc_free_on_write_mtx);
4924
4925 list_destroy(l2arc_dev_list);
4926 list_destroy(l2arc_free_on_write);
4927}
b128c09f
BB
4928
4929void
4930l2arc_start(void)
4931{
fb5f0bc8 4932 if (!(spa_mode_global & FWRITE))
b128c09f
BB
4933 return;
4934
4935 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4936 TS_RUN, minclsyspri);
4937}
4938
4939void
4940l2arc_stop(void)
4941{
fb5f0bc8 4942 if (!(spa_mode_global & FWRITE))
b128c09f
BB
4943 return;
4944
4945 mutex_enter(&l2arc_feed_thr_lock);
4946 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
4947 l2arc_thread_exit = 1;
4948 while (l2arc_thread_exit != 0)
4949 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4950 mutex_exit(&l2arc_feed_thr_lock);
4951}
c28b2279
BB
4952
4953#if defined(_KERNEL) && defined(HAVE_SPL)
4954EXPORT_SYMBOL(arc_read);
4955EXPORT_SYMBOL(arc_buf_remove_ref);
4956EXPORT_SYMBOL(arc_getbuf_func);
ab26409d
BB
4957EXPORT_SYMBOL(arc_add_prune_callback);
4958EXPORT_SYMBOL(arc_remove_prune_callback);
c28b2279 4959
c409e464
BB
4960module_param(zfs_arc_min, ulong, 0444);
4961MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
c28b2279 4962
c409e464
BB
4963module_param(zfs_arc_max, ulong, 0444);
4964MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
c28b2279 4965
c409e464 4966module_param(zfs_arc_meta_limit, ulong, 0444);
c28b2279 4967MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
6a8f9b6b 4968
ab26409d
BB
4969module_param(zfs_arc_meta_prune, int, 0444);
4970MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
c409e464
BB
4971
4972module_param(zfs_arc_grow_retry, int, 0444);
4973MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
4974
4975module_param(zfs_arc_shrink_shift, int, 0444);
4976MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
4977
4978module_param(zfs_arc_p_min_shift, int, 0444);
4979MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
4980
abd8610c
BB
4981module_param(l2arc_write_max, ulong, 0444);
4982MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
4983
4984module_param(l2arc_write_boost, ulong, 0444);
4985MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
4986
4987module_param(l2arc_headroom, ulong, 0444);
4988MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
4989
4990module_param(l2arc_feed_secs, ulong, 0444);
4991MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
4992
4993module_param(l2arc_feed_min_ms, ulong, 0444);
4994MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
4995
4996module_param(l2arc_noprefetch, int, 0444);
4997MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
4998
4999module_param(l2arc_feed_again, int, 0444);
5000MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
5001
5002module_param(l2arc_norw, int, 0444);
5003MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
5004
c28b2279 5005#endif