module/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28
  29 /*
  30  * DVA-based Adjustable Replacement Cache
  31  *
  32  * While much of the theory of operation used here is
  33  * based on the self-tuning, low overhead replacement cache
  34  * presented by Megiddo and Modha at FAST 2003, there are some
  35  * significant differences:
  36  *
  37  * 1. The Megiddo and Modha model assumes any page is evictable.
  38  * Pages in its cache cannot be "locked" into memory.  This makes
  39  * the eviction algorithm simple: evict the last page in the list.
  40  * This also makes the performance characteristics easy to reason
  41  * about.  Our cache is not so simple.  At any given moment, some
  42  * subset of the blocks in the cache are un-evictable because we
  43  * have handed out a reference to them.  Blocks are only evictable
  44  * when there are no external references active.  This makes
  45  * eviction far more problematic:  we choose to evict the evictable
  46  * blocks that are the "lowest" in the list.
  47  *
  48  * There are times when it is not possible to evict the requested
  49  * space.  In these circumstances we are unable to adjust the cache
  50  * size.  To prevent the cache growing unbounded at these times we
  51  * implement a "cache throttle" that slows the flow of new data
  52  * into the cache until we can make space available.
  53  *
  54  * 2. The Megiddo and Modha model assumes a fixed cache size.
  55  * Pages are evicted when the cache is full and there is a cache
  56  * miss.  Our model has a variable sized cache.  It grows with
  57  * high use, but also tries to react to memory pressure from the
  58  * operating system: decreasing its size when system memory is
  59  * tight.
  60  *
  61  * 3. The Megiddo and Modha model assumes a fixed page size. All
  62  * elements of the cache are therefore exactly the same size.  So
  63  * when adjusting the cache size following a cache miss, it's simply
  64  * a matter of choosing a single page to evict.  In our model, we
  65  * have variable sized cache blocks (ranging from 512 bytes to
  66  * 128K bytes).  We therefore choose a set of blocks to evict to make
  67  * space for a cache miss that approximates as closely as possible
  68  * the space used by the new block.
  69  *
  70  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71  * by N. Megiddo & D. Modha, FAST 2003
  72  */
  73
  74 /*
  75  * The locking model:
  76  *
  77  * A new reference to a cache buffer can be obtained in two
  78  * ways: 1) via a hash table lookup using the DVA as a key,
  79  * or 2) via one of the ARC lists.  The arc_read() interface
  80  * uses method 1, while the internal arc algorithms for
  81  * adjusting the cache use method 2.  We therefore provide two
  82  * types of locks: 1) the hash table lock array, and 2) the
  83  * arc list locks.
  84  *
  85  * Buffers do not have their own mutexes, rather they rely on the
  86  * hash table mutexes for the bulk of their protection (i.e. most
  87  * fields in the arc_buf_hdr_t are protected by these mutexes).
  88  *
  89  * buf_hash_find() returns the appropriate mutex (held) when it
  90  * locates the requested buffer in the hash table.  It returns
  91  * NULL for the mutex if the buffer was not in the table.
  92  *
  93  * buf_hash_remove() expects the appropriate hash mutex to be
  94  * already held before it is invoked.
  95  *
  96  * Each arc state also has a mutex which is used to protect the
  97  * buffer list associated with the state.  When attempting to
  98  * obtain a hash table lock while holding an arc list lock you
  99  * must use: mutex_tryenter() to avoid deadlock.  Also note that
 100  * the active state mutex must be held before the ghost state mutex.
 101  *
 102  * Arc buffers may have an associated eviction callback function.
 103  * This function will be invoked prior to removing the buffer (e.g.
 104  * in arc_do_user_evicts()).  Note however that the data associated
 105  * with the buffer may be evicted prior to the callback.  The callback
 106  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 107  * the users of callbacks must ensure that their private data is
 108  * protected from simultaneous callbacks from arc_clear_callback()
 109  * and arc_do_user_evicts().
 110  *
 111  * It is also possible to register a callback which is run when the
 112  * arc_meta_limit is reached and no buffers can be safely evicted.  In
 113  * this case the arc user should drop a reference on some arc buffers so
 114  * they can be reclaimed and the arc_meta_limit honored.  For example,
 115  * when using the ZPL each dentry holds a reference on a znode.  These
 116  * dentries must be pruned before the arc buffer holding the znode can
 117  * be safely evicted.
 118  *
 119  * Note that the majority of the performance stats are manipulated
 120  * with atomic operations.
 121  *
 122  * The L2ARC uses the l2ad_mtx on each vdev for the following:
 123  *
 124  *      - L2ARC buflist creation
 125  *      - L2ARC buflist eviction
 126  *      - L2ARC write completion, which walks L2ARC buflists
 127  *      - ARC header destruction, as it removes from L2ARC buflists
 128  *      - ARC header release, as it removes from L2ARC buflists
 129  */
 130
 131 #include <sys/spa.h>
 132 #include <sys/zio.h>
 133 #include <sys/zio_compress.h>
 134 #include <sys/zfs_context.h>
 135 #include <sys/arc.h>
 136 #include <sys/refcount.h>
 137 #include <sys/vdev.h>
 138 #include <sys/vdev_impl.h>
 139 #include <sys/dsl_pool.h>
 140 #include <sys/multilist.h>
 141 #ifdef _KERNEL
 142 #include <sys/vmsystm.h>
 143 #include <vm/anon.h>
 144 #include <sys/fs/swapnode.h>
 145 #include <sys/zpl.h>
 146 #include <linux/mm_compat.h>
 147 #endif
 148 #include <sys/callb.h>
 149 #include <sys/kstat.h>
 150 #include <sys/dmu_tx.h>
 151 #include <zfs_fletcher.h>
 152 #include <sys/arc_impl.h>
 153 #include <sys/trace_arc.h>
 154
 155 #ifndef _KERNEL
 156 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 157 boolean_t arc_watch = B_FALSE;
 158 #endif
 159
     /*
      * A mutex, two condition variables and an exit flag, all named for the
      * "arc_reclaim" thread.
      * NOTE(review): their users are outside this chunk; presumably the
      * mutex guards the exit flag and both condition variables -- confirm.
      */
 160 static kmutex_t         arc_reclaim_lock;
 161 static kcondvar_t       arc_reclaim_thread_cv;
 162 static boolean_t        arc_reclaim_thread_exit;
 163 static kcondvar_t       arc_reclaim_waiters_cv;
 164
     /*
      * The matching mutex, condition variable and exit flag named for the
      * "arc_user_evicts" thread.
      * NOTE(review): same caveat as above -- usage not visible here.
      */
 165 static kmutex_t         arc_user_evicts_lock;
 166 static kcondvar_t       arc_user_evicts_cv;
 167 static boolean_t        arc_user_evicts_thread_exit;
 168
 169 /*
 170  * The number of headers to evict in arc_evict_state_impl() before
 171  * dropping the sublist lock and evicting from another sublist. A lower
 172  * value means we're more likely to evict the "correct" header (i.e. the
 173  * oldest header in the arc state), but comes with higher overhead
 174  * (i.e. more invocations of arc_evict_state_impl()).
 175  */
 176 int zfs_arc_evict_batch_limit = 10;
 177
 178 /*
 179  * The number of sublists used for each of the arc state lists. If this
 180  * is not set to a suitable value by the user, it will be configured to
 181  * the number of CPUs on the system in arc_init().
 182  */
 183 int zfs_arc_num_sublists_per_state = 0;
 184
 185 /* number of seconds before growing cache again */
 186 static int              arc_grow_retry = 5;
 187
 188 /* shift of arc_c for calculating overflow limit in arc_get_data_buf */
 189 int             zfs_arc_overflow_shift = 8;
 190
 191 /* shift of arc_c for calculating both min and max arc_p */
 192 static int              arc_p_min_shift = 4;
 193
 194 /* log2(fraction of arc to reclaim), i.e. 7 selects 1/128th of the arc */
 195 static int              arc_shrink_shift = 7;
 196
 197 /*
 198  * log2(fraction of ARC which must be free to allow growing).
 199  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 200  * when reading a new block into the ARC, we will evict an equal-sized block
 201  * from the ARC.
 202  *
 203  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 204  * we will still not allow it to grow.
      *
      * With the initial values in this file the constraint holds: 5 here
      * (arc_c >> 5, i.e. 1/32nd of arc_c) against arc_shrink_shift = 7.
 205  */
 206 int                     arc_no_grow_shift = 5;
 207
 208
 209 /*
 210  * minimum lifespan of a prefetch block in clock ticks
 211  * (initialized in arc_init())
 212  */
 213 static int              arc_min_prefetch_lifespan;
 214
 215 /*
 216  * If this percent of memory is free, don't throttle.
 217  */
 218 int arc_lotsfree_percent = 10;
 219
     /*
      * Zero-initialized file-scope flag.
      * NOTE(review): neither set nor read in this chunk; the name suggests
      * it marks the ARC as shut down -- confirm against its users.
      */
 220 static int arc_dead;
 221
 222 /*
 223  * The arc has filled available memory and has now warmed up.
 224  */
 225 static boolean_t arc_warm;
 226
 227 /*
 228  * These tunables are for performance analysis.
      *
      * Every one of them except zfs_arc_average_blocksize starts out as 0.
      * NOTE(review): 0 presumably means "not set, keep the built-in value"
      * (compare zfs_arc_grow_retry here with arc_grow_retry = 5 earlier in
      * the file); the code that reads them is outside this chunk -- confirm.
 229  */
 230 unsigned long zfs_arc_max = 0;
 231 unsigned long zfs_arc_min = 0;
 232 unsigned long zfs_arc_meta_limit = 0;
 233 unsigned long zfs_arc_meta_min = 0;
 234 int zfs_arc_grow_retry = 0;
 235 int zfs_arc_shrink_shift = 0;
 236 int zfs_arc_p_min_shift = 0;
 237 int zfs_disable_dup_eviction = 0;
 238 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 239
 240 /*
 241  * These tunables are Linux specific
      *
      * The three *_disable switches start out as 1.
      * NOTE(review): going by the names, 1 presumably turns the named
      * behaviour off -- confirm against the code that tests them.
 242  */
 243 int zfs_arc_memory_throttle_disable = 1;
 244 int zfs_arc_min_prefetch_lifespan = 0;
 245 int zfs_arc_p_aggressive_disable = 1;
 246 int zfs_arc_p_dampener_disable = 1;
 247 int zfs_arc_meta_prune = 10000;
 248 int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
 249 int zfs_arc_meta_adjust_restarts = 4096;
 250
 251 /* The 6 states: */
     /*
      * Static storage for the state objects themselves.
      * NOTE(review): the lower-case arc_anon ... arc_l2c_only pointers
      * declared further down in this file presumably refer to these; the
      * assignment is not visible in this chunk.
      */
 252 static arc_state_t ARC_anon;
 253 static arc_state_t ARC_mru;
 254 static arc_state_t ARC_mru_ghost;
 255 static arc_state_t ARC_mfu;
 256 static arc_state_t ARC_mfu_ghost;
 257 static arc_state_t ARC_l2c_only;
 258
 259 typedef struct arc_stats {
             /*
              * One kstat_named_t per exported ARC statistic.  The arc_stats
              * initializer that follows this typedef is positional, so members
              * must be added, removed or reordered in both places together.
              */
 260         kstat_named_t arcstat_hits;
 261         kstat_named_t arcstat_misses;
 262         kstat_named_t arcstat_demand_data_hits;
 263         kstat_named_t arcstat_demand_data_misses;
 264         kstat_named_t arcstat_demand_metadata_hits;
 265         kstat_named_t arcstat_demand_metadata_misses;
 266         kstat_named_t arcstat_prefetch_data_hits;
 267         kstat_named_t arcstat_prefetch_data_misses;
 268         kstat_named_t arcstat_prefetch_metadata_hits;
 269         kstat_named_t arcstat_prefetch_metadata_misses;
 270         kstat_named_t arcstat_mru_hits;
 271         kstat_named_t arcstat_mru_ghost_hits;
 272         kstat_named_t arcstat_mfu_hits;
 273         kstat_named_t arcstat_mfu_ghost_hits;
 274         kstat_named_t arcstat_deleted;
 275         /*
 276          * Number of buffers that could not be evicted because the hash lock
 277          * was held by another thread.  The lock may not necessarily be held
 278          * by something using the same buffer, since hash locks are shared
 279          * by multiple buffers.
 280          */
 281         kstat_named_t arcstat_mutex_miss;
 282         /*
 283          * Number of buffers skipped because they have I/O in progress, are
 284          * indirect prefetch buffers that have not lived long enough, or are
 285          * not from the spa we're trying to evict from.
 286          */
 287         kstat_named_t arcstat_evict_skip;
 288         /*
 289          * Number of times arc_evict_state() was unable to evict enough
 290          * buffers to reach its target amount.
 291          */
 292         kstat_named_t arcstat_evict_not_enough;
 293         kstat_named_t arcstat_evict_l2_cached;
 294         kstat_named_t arcstat_evict_l2_eligible;
 295         kstat_named_t arcstat_evict_l2_ineligible;
 296         kstat_named_t arcstat_evict_l2_skip;
 297         kstat_named_t arcstat_hash_elements;
 298         kstat_named_t arcstat_hash_elements_max;
 299         kstat_named_t arcstat_hash_collisions;
 300         kstat_named_t arcstat_hash_chains;
 301         kstat_named_t arcstat_hash_chain_max;
 302         kstat_named_t arcstat_p;
 303         kstat_named_t arcstat_c;
 304         kstat_named_t arcstat_c_min;
 305         kstat_named_t arcstat_c_max;
 306         kstat_named_t arcstat_size;
 307         /*
 308          * Number of bytes consumed by internal ARC structures necessary
 309          * for tracking purposes; these structures are not actually
 310          * backed by ARC buffers. This includes arc_buf_hdr_t structures
 311          * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
 312          * caches), and arc_buf_t structures (allocated via arc_buf_t
 313          * cache).
 314          */
 315         kstat_named_t arcstat_hdr_size;
 316         /*
 317          * Number of bytes consumed by ARC buffers of type equal to
 318          * ARC_BUFC_DATA. This is generally consumed by buffers backing
 319          * on disk user data (e.g. plain file contents).
 320          */
 321         kstat_named_t arcstat_data_size;
 322         /*
 323          * Number of bytes consumed by ARC buffers of type equal to
 324          * ARC_BUFC_METADATA. This is generally consumed by buffers
 325          * backing on disk data that is used for internal ZFS
 326          * structures (e.g. ZAP, dnode, indirect blocks, etc).
 327          */
 328         kstat_named_t arcstat_metadata_size;
 329         /*
 330          * Number of bytes consumed by various buffers and structures
 331          * not actually backed with ARC buffers. This includes bonus
 332          * buffers (allocated directly via zio_buf_* functions),
 333          * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
 334          * cache), and dnode_t structures (allocated via dnode_t cache).
 335          */
 336         kstat_named_t arcstat_other_size;
 337         /*
 338          * Total number of bytes consumed by ARC buffers residing in the
 339          * arc_anon state. This includes *all* buffers in the arc_anon
 340          * state; e.g. data, metadata, evictable, and unevictable buffers
 341          * are all included in this value.
 342          */
 343         kstat_named_t arcstat_anon_size;
 344         /*
 345          * Number of bytes consumed by ARC buffers that meet the
 346          * following criteria: backing buffers of type ARC_BUFC_DATA,
 347          * residing in the arc_anon state, and are eligible for eviction
 348          * (e.g. have no outstanding holds on the buffer).
 349          */
 350         kstat_named_t arcstat_anon_evictable_data;
 351         /*
 352          * Number of bytes consumed by ARC buffers that meet the
 353          * following criteria: backing buffers of type ARC_BUFC_METADATA,
 354          * residing in the arc_anon state, and are eligible for eviction
 355          * (e.g. have no outstanding holds on the buffer).
 356          */
 357         kstat_named_t arcstat_anon_evictable_metadata;
 358         /*
 359          * Total number of bytes consumed by ARC buffers residing in the
 360          * arc_mru state. This includes *all* buffers in the arc_mru
 361          * state; e.g. data, metadata, evictable, and unevictable buffers
 362          * are all included in this value.
 363          */
 364         kstat_named_t arcstat_mru_size;
 365         /*
 366          * Number of bytes consumed by ARC buffers that meet the
 367          * following criteria: backing buffers of type ARC_BUFC_DATA,
 368          * residing in the arc_mru state, and are eligible for eviction
 369          * (e.g. have no outstanding holds on the buffer).
 370          */
 371         kstat_named_t arcstat_mru_evictable_data;
 372         /*
 373          * Number of bytes consumed by ARC buffers that meet the
 374          * following criteria: backing buffers of type ARC_BUFC_METADATA,
 375          * residing in the arc_mru state, and are eligible for eviction
 376          * (e.g. have no outstanding holds on the buffer).
 377          */
 378         kstat_named_t arcstat_mru_evictable_metadata;
 379         /*
 380          * Total number of bytes that *would have been* consumed by ARC
 381          * buffers in the arc_mru_ghost state. The key thing to note
 382          * here, is the fact that this size doesn't actually indicate
 383          * RAM consumption. The ghost lists only consist of headers and
 384          * don't actually have ARC buffers linked off of these headers.
 385          * Thus, *if* the headers had associated ARC buffers, these
 386          * buffers *would have* consumed this number of bytes.
 387          */
 388         kstat_named_t arcstat_mru_ghost_size;
 389         /*
 390          * Number of bytes that *would have been* consumed by ARC
 391          * buffers that are eligible for eviction, of type
 392          * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
 393          */
 394         kstat_named_t arcstat_mru_ghost_evictable_data;
 395         /*
 396          * Number of bytes that *would have been* consumed by ARC
 397          * buffers that are eligible for eviction, of type
 398          * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
 399          */
 400         kstat_named_t arcstat_mru_ghost_evictable_metadata;
 401         /*
 402          * Total number of bytes consumed by ARC buffers residing in the
 403          * arc_mfu state. This includes *all* buffers in the arc_mfu
 404          * state; e.g. data, metadata, evictable, and unevictable buffers
 405          * are all included in this value.
 406          */
 407         kstat_named_t arcstat_mfu_size;
 408         /*
 409          * Number of bytes consumed by ARC buffers that are eligible for
 410          * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
 411          * state.
 412          */
 413         kstat_named_t arcstat_mfu_evictable_data;
 414         /*
 415          * Number of bytes consumed by ARC buffers that are eligible for
 416          * eviction, of type ARC_BUFC_METADATA, and reside in the
 417          * arc_mfu state.
 418          */
 419         kstat_named_t arcstat_mfu_evictable_metadata;
 420         /*
 421          * Total number of bytes that *would have been* consumed by ARC
 422          * buffers in the arc_mfu_ghost state. See the comment above
 423          * arcstat_mru_ghost_size for more details.
 424          */
 425         kstat_named_t arcstat_mfu_ghost_size;
 426         /*
 427          * Number of bytes that *would have been* consumed by ARC
 428          * buffers that are eligible for eviction, of type
 429          * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
 430          */
 431         kstat_named_t arcstat_mfu_ghost_evictable_data;
 432         /*
 433          * Number of bytes that *would have been* consumed by ARC
 434          * buffers that are eligible for eviction, of type
 435          * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
 436          */
 437         kstat_named_t arcstat_mfu_ghost_evictable_metadata;
 438         kstat_named_t arcstat_l2_hits;
 439         kstat_named_t arcstat_l2_misses;
 440         kstat_named_t arcstat_l2_feeds;
 441         kstat_named_t arcstat_l2_rw_clash;
 442         kstat_named_t arcstat_l2_read_bytes;
 443         kstat_named_t arcstat_l2_write_bytes;
 444         kstat_named_t arcstat_l2_writes_sent;
 445         kstat_named_t arcstat_l2_writes_done;
 446         kstat_named_t arcstat_l2_writes_error;
 447         kstat_named_t arcstat_l2_writes_lock_retry;
 448         kstat_named_t arcstat_l2_evict_lock_retry;
 449         kstat_named_t arcstat_l2_evict_reading;
 450         kstat_named_t arcstat_l2_evict_l1cached;
 451         kstat_named_t arcstat_l2_free_on_write;
 452         kstat_named_t arcstat_l2_cdata_free_on_write;
 453         kstat_named_t arcstat_l2_abort_lowmem;
 454         kstat_named_t arcstat_l2_cksum_bad;
 455         kstat_named_t arcstat_l2_io_error;
 456         kstat_named_t arcstat_l2_size;
 457         kstat_named_t arcstat_l2_asize;
 458         kstat_named_t arcstat_l2_hdr_size;
 459         kstat_named_t arcstat_l2_compress_successes;
 460         kstat_named_t arcstat_l2_compress_zeros;
 461         kstat_named_t arcstat_l2_compress_failures;
 462         kstat_named_t arcstat_memory_throttle_count;
 463         kstat_named_t arcstat_duplicate_buffers;
 464         kstat_named_t arcstat_duplicate_buffers_size;
 465         kstat_named_t arcstat_duplicate_reads;
 466         kstat_named_t arcstat_memory_direct_count;
 467         kstat_named_t arcstat_memory_indirect_count;
 468         kstat_named_t arcstat_no_grow;
 469         kstat_named_t arcstat_tempreserve;
 470         kstat_named_t arcstat_loaned_bytes;
 471         kstat_named_t arcstat_prune;
 472         kstat_named_t arcstat_meta_used;
 473         kstat_named_t arcstat_meta_limit;
 474         kstat_named_t arcstat_meta_max;
 475         kstat_named_t arcstat_meta_min;
 476 } arc_stats_t;
 477
 478 static arc_stats_t arc_stats = {
             /*
              * Positional initializer: entry N supplies the exported name and
              * data type (always KSTAT_DATA_UINT64) for member N of
              * arc_stats_t, so the order here must match the typedef exactly.
              * The exported name is the member name without its "arcstat_"
              * prefix, except for the last eight entries (no_grow through
              * meta_min), whose exported names carry an "arc_" prefix instead.
              */
 479         { "hits",                       KSTAT_DATA_UINT64 },
 480         { "misses",                     KSTAT_DATA_UINT64 },
 481         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 482         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 483         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 484         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 485         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 486         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 487         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 488         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 489         { "mru_hits",                   KSTAT_DATA_UINT64 },
 490         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 491         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 492         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 493         { "deleted",                    KSTAT_DATA_UINT64 },
 494         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 495         { "evict_skip",                 KSTAT_DATA_UINT64 },
 496         { "evict_not_enough",           KSTAT_DATA_UINT64 },
 497         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 498         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 499         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 500         { "evict_l2_skip",              KSTAT_DATA_UINT64 },
 501         { "hash_elements",              KSTAT_DATA_UINT64 },
 502         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 503         { "hash_collisions",            KSTAT_DATA_UINT64 },
 504         { "hash_chains",                KSTAT_DATA_UINT64 },
 505         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 506         { "p",                          KSTAT_DATA_UINT64 },
 507         { "c",                          KSTAT_DATA_UINT64 },
 508         { "c_min",                      KSTAT_DATA_UINT64 },
 509         { "c_max",                      KSTAT_DATA_UINT64 },
 510         { "size",                       KSTAT_DATA_UINT64 },
 511         { "hdr_size",                   KSTAT_DATA_UINT64 },
 512         { "data_size",                  KSTAT_DATA_UINT64 },
 513         { "metadata_size",              KSTAT_DATA_UINT64 },
 514         { "other_size",                 KSTAT_DATA_UINT64 },
 515         { "anon_size",                  KSTAT_DATA_UINT64 },
 516         { "anon_evictable_data",        KSTAT_DATA_UINT64 },
 517         { "anon_evictable_metadata",    KSTAT_DATA_UINT64 },
 518         { "mru_size",                   KSTAT_DATA_UINT64 },
 519         { "mru_evictable_data",         KSTAT_DATA_UINT64 },
 520         { "mru_evictable_metadata",     KSTAT_DATA_UINT64 },
 521         { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 522         { "mru_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 523         { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 524         { "mfu_size",                   KSTAT_DATA_UINT64 },
 525         { "mfu_evictable_data",         KSTAT_DATA_UINT64 },
 526         { "mfu_evictable_metadata",     KSTAT_DATA_UINT64 },
 527         { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 528         { "mfu_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 529         { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 530         { "l2_hits",                    KSTAT_DATA_UINT64 },
 531         { "l2_misses",                  KSTAT_DATA_UINT64 },
 532         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 533         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 534         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 535         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 536         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 537         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 538         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 539         { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
 540         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 541         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 542         { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
 543         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 544         { "l2_cdata_free_on_write",     KSTAT_DATA_UINT64 },
 545         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 546         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 547         { "l2_io_error",                KSTAT_DATA_UINT64 },
 548         { "l2_size",                    KSTAT_DATA_UINT64 },
 549         { "l2_asize",                   KSTAT_DATA_UINT64 },
 550         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 551         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 552         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 553         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 554         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 555         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 556         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 557         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 558         { "memory_direct_count",        KSTAT_DATA_UINT64 },
 559         { "memory_indirect_count",      KSTAT_DATA_UINT64 },
 560         { "arc_no_grow",                KSTAT_DATA_UINT64 },
 561         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
 562         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
 563         { "arc_prune",                  KSTAT_DATA_UINT64 },
 564         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 565         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 566         { "arc_meta_max",               KSTAT_DATA_UINT64 },
 567         { "arc_meta_min",               KSTAT_DATA_UINT64 }
 568 };
 569
 570 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
     /*
      * ARCSTAT(stat), above, names the ui64 value of one arc_stats member
      * directly; the expansion is a plain member access, so it can be both
      * read and assigned.
      */
 571
     /* Add val to a statistic through atomic_add_64(). */
 572 #define ARCSTAT_INCR(stat, val) \
 573         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 574
     /* Shorthands for ARCSTAT_INCR() with +1 and -1. */
 575 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 576 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 577
     /*
      * Raise a statistic to val when val is larger than the stored value.
      * The loop re-reads the statistic into m and retries atomic_cas_64()
      * until either val is no longer greater than the stored value or the
      * call hands back the same m that was read (the loop treats that as
      * a successful swap).  val is evaluated more than once, so it must not
      * have side effects.  The expansion is a bare { } block rather than a
      * do { } while (0) statement.
      */
 578 #define ARCSTAT_MAX(stat, val) {                                        \
 579         uint64_t m;                                                     \
 580         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 581             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 582                 continue;                                               \
 583 }
 584
     /*
      * Fold the current value of <stat> into its companion <stat>_max
      * statistic, e.g. arcstat_hash_elements into arcstat_hash_elements_max.
      */
 585 #define ARCSTAT_MAXSTAT(stat) \
 586         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 587
 588 /*
 589  * We define a macro to allow ARC hits/misses to be easily broken down by
 590  * two separate conditions, giving a total of four different subtypes for
 591  * each of hits and misses (so eight statistics total).
      *
      * The statistic bumped is named arcstat_<a>_<b>_<stat>, where <a> is
      * stat1 when cond1 is true and notstat1 otherwise, and <b> is stat2 when
      * cond2 is true and notstat2 otherwise.  All four possible names must be
      * members of arc_stats_t, since every branch is compiled.
      *
      * The expansion is a bare if/else statement rather than a
      * do { } while (0) block, so enclose it in braces when it is used as
      * the body of another if.
 592  */
 593 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 594         if (cond1) {                                                    \
 595                 if (cond2) {                                            \
 596                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 597                 } else {                                                \
 598                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 599                 }                                                       \
 600         } else {                                                        \
 601                 if (cond2) {                                            \
 602                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 603                 } else {                                                \
 604                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 605                 }                                                       \
 606         }
 607
 608 kstat_t                 *arc_ksp;
     /*
      * One pointer per ARC state; these are the names that state tests such
      * as GHOST_STATE() compare against.
      * NOTE(review): presumably pointed at the ARC_anon ... ARC_l2c_only
      * objects during initialization, and arc_ksp (above) at the kstat built
      * from arc_stats -- neither assignment is visible in this chunk.
      */
 609 static arc_state_t      *arc_anon;
 610 static arc_state_t      *arc_mru;
 611 static arc_state_t      *arc_mru_ghost;
 612 static arc_state_t      *arc_mfu;
 613 static arc_state_t      *arc_mfu_ghost;
 614 static arc_state_t      *arc_l2c_only;
 615
 616 /*
 617  * There are several ARC variables that are critical to export as kstats --
 618  * but we don't want to have to grovel around in the kstat whenever we wish to
 619  * manipulate them.  For these variables, we therefore define them to be in
 620  * terms of the statistic variable.  This assures that we are not introducing
 621  * the possibility of inconsistency by having shadow copies of the variables,
 622  * while still allowing the code to be readable.
      *
      * Each name below expands to ARCSTAT(<member>), i.e. to the ui64 value
      * stored inside arc_stats, so it can be used as an lvalue.  Reads and
      * writes through these names are ordinary memory accesses; only
      * ARCSTAT_INCR() and ARCSTAT_MAX() go through the atomic_* routines.
 623  */
 624 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 625 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 626 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 627 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 628 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 629 #define arc_no_grow     ARCSTAT(arcstat_no_grow)
 630 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
 631 #define arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
 632 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 633 #define arc_meta_min    ARCSTAT(arcstat_meta_min) /* min size for metadata */
 634 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 635 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 636
     /*
      * True when _c_ is ZIO_COMPRESS_LZ4 or ZIO_COMPRESS_EMPTY.  _c_ may be
      * evaluated twice.
      */
 637 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 638         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 639
 640 static list_t arc_prune_list;
 641 static kmutex_t arc_prune_mtx;
 642 static taskq_t *arc_prune_taskq;
     /*
      * NOTE(review): none of the five objects in this group is used within
      * this chunk.  Going by the names, arc_prune_list, arc_prune_mtx and
      * arc_prune_taskq presumably serve the arc_meta_limit callbacks
      * described in the locking-model comment at the top of the file, and
      * arc_eviction_list / arc_eviction_hdr the user-eviction path --
      * confirm against their users.
      */
 643 static arc_buf_t *arc_eviction_list;
 644 static arc_buf_hdr_t arc_eviction_hdr;
 645
/*
 * True when state is one of arc_mru_ghost, arc_mfu_ghost or arc_l2c_only.
 * The argument is evaluated up to three times.
 */
#define GHOST_STATE(state)      \
        ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
        (state) == arc_l2c_only)
 649
/*
 * Header flag tests.  Each single-flag macro yields the masked b_flags
 * bit (non-zero when set), not a normalized 0/1 value.
 */
#define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)

#define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define HDR_L2COMPRESS(hdr)     ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
/* True only when both IO_IN_PROGRESS and HAS_L2HDR are set. */
#define HDR_L2_READING(hdr)     \
            (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&      \
            ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)

/* A header is "data" exactly when the metadata flag is clear. */
#define HDR_ISTYPE_METADATA(hdr)        \
            ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))

/* Which of the L1 / L2 sub-headers the header currently carries. */
#define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 672
 673 /* For storing compression mode in b_flags */
 674 #define HDR_COMPRESS_OFFSET     24
 675 #define HDR_COMPRESS_NBITS      7
 676
 677 #define HDR_GET_COMPRESS(hdr)   ((enum zio_compress)BF32_GET(hdr->b_flags, \
 678             HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
 679 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
 680             HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
 681
/*
 * Other sizes
 *
 * HDR_FULL_SIZE is the size of a complete arc_buf_hdr_t.  HDR_L2ONLY_SIZE
 * covers only the members that precede b_l1hdr, i.e. a header without its
 * L1 portion.  Both are cast to int64_t.
 */

#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 688
 689 /*
 690  * Hash table routines
 691  */
 692
/*
 * A hash-chain lock.  In kernel builds each lock is followed by padding;
 * presumably P2NPHASE() yields the byte count that rounds the mutex up to
 * a multiple of HT_LOCK_ALIGN -- confirm against its definition.
 */
#define HT_LOCK_ALIGN   64
#define HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))

struct ht_lock {
        kmutex_t        ht_lock;
#ifdef _KERNEL
        unsigned char   pad[HT_LOCK_PAD];
#endif
};
 702
/*
 * The header hash table: ht_table is an array of (ht_mask + 1) chain
 * heads (sized in buf_init()), and ht_locks is a fixed pool of BUF_LOCKS
 * mutexes selected by the low bits of the bucket index (see
 * BUF_HASH_LOCK()).
 */
#define BUF_LOCKS 8192
typedef struct buf_hash_table {
        uint64_t ht_mask;
        arc_buf_hdr_t **ht_table;
        struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;
 711
 712 #define BUF_HASH_INDEX(spa, dva, birth) \
 713         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 714 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 715 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 716 #define HDR_LOCK(hdr) \
 717         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 718
/* CRC-64 lookup table; filled in by buf_init() and consumed by buf_hash(). */
uint64_t zfs_crc64_table[256];
 720
/*
 * Level 2 ARC
 *
 * Compile-time defaults for the l2arc_* tunables.
 */

#define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
#define L2ARC_HEADROOM          2                       /* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define L2ARC_HEADROOM_BOOST    200
#define L2ARC_FEED_SECS         1               /* caching interval secs */
#define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 734
/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
 * address. This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out which happens in the second stage of
 * l2arc_write_buffers().
 */
#define L2ARC_ADDR_UNSET        ((uint64_t)(-1))
 744
/* Aliases for the L2ARC write counters kept in the ARC statistics. */
#define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 747
/*
 * L2ARC Performance Tunables
 *
 * Each is initialized from the matching L2ARC_* default or a boolean.
 */
unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;       /* def max write size */
unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;     /* extra warmup write */
unsigned long l2arc_headroom = L2ARC_HEADROOM;          /* # of dev writes */
/* headroom boost, a percentage (see L2ARC_HEADROOM_BOOST) */
unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;        /* interval seconds */
unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;    /* min interval msecs */
int l2arc_noprefetch = B_TRUE;                  /* don't cache prefetch bufs */
int l2arc_nocompress = B_FALSE;                 /* don't compress bufs */
int l2arc_feed_again = B_TRUE;                  /* turbo warmup */
int l2arc_norw = B_FALSE;                       /* no reads during writes */
 759
/*
 * L2ARC Internals
 *
 * The upper-case list_t objects are the storage; the lower-case pointers
 * are presumably set to point at them during initialization (not shown in
 * this part of the file).
 */
static list_t L2ARC_dev_list;                   /* device list */
static list_t *l2arc_dev_list;                  /* device list pointer */
static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
static l2arc_dev_t *l2arc_dev_last;             /* last device used */
static list_t L2ARC_free_on_write;              /* free after write buf list */
static list_t *l2arc_free_on_write;             /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
static uint64_t l2arc_ndev;                     /* number of devices */
 771
/*
 * State recorded for an L2ARC read.
 * NOTE(review): presumably handed to l2arc_read_done() as the zio's
 * private data -- confirm at the point where the read is issued.
 */
typedef struct l2arc_read_callback {
        arc_buf_t               *l2rcb_buf;             /* read buffer */
        spa_t                   *l2rcb_spa;             /* spa */
        blkptr_t                l2rcb_bp;               /* original blkptr */
        zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
        int                     l2rcb_flags;            /* original flags */
        enum zio_compress       l2rcb_compress;         /* applied compress */
} l2arc_read_callback_t;
 780
/*
 * One deferred-free record: a data pointer, its size, and the function
 * to release it with.
 * NOTE(review): presumably linked on the l2arc_free_on_write list via
 * l2df_list_node -- confirm where records are queued.
 */
typedef struct l2arc_data_free {
        /* protected by l2arc_free_on_write_mtx */
        void            *l2df_data;
        size_t          l2df_size;
        void            (*l2df_func)(void *, size_t);
        list_node_t     l2df_list_node;
} l2arc_data_free_t;
 788
/*
 * L2ARC feed thread synchronization.
 * NOTE(review): l2arc_thread_exit is presumably the shutdown request flag
 * for the feed thread, used with the lock/cv pair -- confirm in the
 * thread's start/stop code.
 */
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
 792
/* Forward declarations of static helpers used before their definitions. */
static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
static void arc_tuning_update(void);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);

static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);

static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
 808
 809 static uint64_t
 810 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 811 {
 812         uint8_t *vdva = (uint8_t *)dva;
 813         uint64_t crc = -1ULL;
 814         int i;
 815
 816         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 817
 818         for (i = 0; i < sizeof (dva_t); i++)
 819                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 820
 821         crc ^= (spa>>8) ^ birth;
 822
 823         return (crc);
 824 }
 825
 826 #define BUF_EMPTY(buf)                                          \
 827         ((buf)->b_dva.dva_word[0] == 0 &&                       \
 828         (buf)->b_dva.dva_word[1] == 0)
 829
 830 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 831         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 832         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 833         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 834
 835 static void
 836 buf_discard_identity(arc_buf_hdr_t *hdr)
 837 {
 838         hdr->b_dva.dva_word[0] = 0;
 839         hdr->b_dva.dva_word[1] = 0;
 840         hdr->b_birth = 0;
 841 }
 842
 843 static arc_buf_hdr_t *
 844 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 845 {
 846         const dva_t *dva = BP_IDENTITY(bp);
 847         uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 848         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 849         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 850         arc_buf_hdr_t *hdr;
 851
 852         mutex_enter(hash_lock);
 853         for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 854             hdr = hdr->b_hash_next) {
 855                 if (BUF_EQUAL(spa, dva, birth, hdr)) {
 856                         *lockp = hash_lock;
 857                         return (hdr);
 858                 }
 859         }
 860         mutex_exit(hash_lock);
 861         *lockp = NULL;
 862         return (NULL);
 863 }
 864
 865 /*
 866  * Insert an entry into the hash table.  If there is already an element
 867  * equal to elem in the hash table, then the already existing element
 868  * will be returned and the new element will not be inserted.
 869  * Otherwise returns NULL.
 870  * If lockp == NULL, the caller is assumed to already hold the hash lock.
 871  */
 872 static arc_buf_hdr_t *
 873 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 874 {
 875         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 876         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 877         arc_buf_hdr_t *fhdr;
 878         uint32_t i;
 879
 880         ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 881         ASSERT(hdr->b_birth != 0);
 882         ASSERT(!HDR_IN_HASH_TABLE(hdr));
 883
 884         if (lockp != NULL) {
 885                 *lockp = hash_lock;
 886                 mutex_enter(hash_lock);
 887         } else {
 888                 ASSERT(MUTEX_HELD(hash_lock));
 889         }
 890
 891         for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
 892             fhdr = fhdr->b_hash_next, i++) {
 893                 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
 894                         return (fhdr);
 895         }
 896
 897         hdr->b_hash_next = buf_hash_table.ht_table[idx];
 898         buf_hash_table.ht_table[idx] = hdr;
 899         hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
 900
 901         /* collect some hash table performance data */
 902         if (i > 0) {
 903                 ARCSTAT_BUMP(arcstat_hash_collisions);
 904                 if (i == 1)
 905                         ARCSTAT_BUMP(arcstat_hash_chains);
 906
 907                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 908         }
 909
 910         ARCSTAT_BUMP(arcstat_hash_elements);
 911         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 912
 913         return (NULL);
 914 }
 915
 916 static void
 917 buf_hash_remove(arc_buf_hdr_t *hdr)
 918 {
 919         arc_buf_hdr_t *fhdr, **hdrp;
 920         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 921
 922         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 923         ASSERT(HDR_IN_HASH_TABLE(hdr));
 924
 925         hdrp = &buf_hash_table.ht_table[idx];
 926         while ((fhdr = *hdrp) != hdr) {
 927                 ASSERT(fhdr != NULL);
 928                 hdrp = &fhdr->b_hash_next;
 929         }
 930         *hdrp = hdr->b_hash_next;
 931         hdr->b_hash_next = NULL;
 932         hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
 933
 934         /* collect some hash table performance data */
 935         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 936
 937         if (buf_hash_table.ht_table[idx] &&
 938             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 939                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 940 }
 941
/*
 * Global data structures and functions for the buf kmem cache.
 *
 * All three caches are created in buf_init() and destroyed in buf_fini():
 * hdr_full_cache holds HDR_FULL_SIZE headers, hdr_l2only_cache holds
 * HDR_L2ONLY_SIZE headers, and buf_cache holds arc_buf_t objects.
 */
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
 948
 949 static void
 950 buf_fini(void)
 951 {
 952         int i;
 953
 954 #if defined(_KERNEL) && defined(HAVE_SPL)
 955         /*
 956          * Large allocations which do not require contiguous pages
 957          * should be using vmem_free() in the linux kernel\
 958          */
 959         vmem_free(buf_hash_table.ht_table,
 960             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 961 #else
 962         kmem_free(buf_hash_table.ht_table,
 963             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 964 #endif
 965         for (i = 0; i < BUF_LOCKS; i++)
 966                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 967         kmem_cache_destroy(hdr_full_cache);
 968         kmem_cache_destroy(hdr_l2only_cache);
 969         kmem_cache_destroy(buf_cache);
 970 }
 971
 972 /*
 973  * Constructor callback - called when the cache is empty
 974  * and a new buf is requested.
 975  */
 976 /* ARGSUSED */
 977 static int
 978 hdr_full_cons(void *vbuf, void *unused, int kmflag)
 979 {
 980         arc_buf_hdr_t *hdr = vbuf;
 981
 982         bzero(hdr, HDR_FULL_SIZE);
 983         cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
 984         refcount_create(&hdr->b_l1hdr.b_refcnt);
 985         mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 986         list_link_init(&hdr->b_l1hdr.b_arc_node);
 987         list_link_init(&hdr->b_l2hdr.b_l2node);
 988         multilist_link_init(&hdr->b_l1hdr.b_arc_node);
 989         arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 990
 991         return (0);
 992 }
 993
 994 /* ARGSUSED */
 995 static int
 996 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 997 {
 998         arc_buf_hdr_t *hdr = vbuf;
 999
1000         bzero(hdr, HDR_L2ONLY_SIZE);
1001         arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1002
1003         return (0);
1004 }
1005
1006 /* ARGSUSED */
1007 static int
1008 buf_cons(void *vbuf, void *unused, int kmflag)
1009 {
1010         arc_buf_t *buf = vbuf;
1011
1012         bzero(buf, sizeof (arc_buf_t));
1013         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1014         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1015
1016         return (0);
1017 }
1018
1019 /*
1020  * Destructor callback - called when a cached buf is
1021  * no longer required.
1022  */
1023 /* ARGSUSED */
1024 static void
1025 hdr_full_dest(void *vbuf, void *unused)
1026 {
1027         arc_buf_hdr_t *hdr = vbuf;
1028
1029         ASSERT(BUF_EMPTY(hdr));
1030         cv_destroy(&hdr->b_l1hdr.b_cv);
1031         refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1032         mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1033         ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1034         arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1035 }
1036
1037 /* ARGSUSED */
1038 static void
1039 hdr_l2only_dest(void *vbuf, void *unused)
1040 {
1041         ASSERTV(arc_buf_hdr_t *hdr = vbuf);
1042
1043         ASSERT(BUF_EMPTY(hdr));
1044         arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1045 }
1046
1047 /* ARGSUSED */
1048 static void
1049 buf_dest(void *vbuf, void *unused)
1050 {
1051         arc_buf_t *buf = vbuf;
1052
1053         mutex_destroy(&buf->b_evict_lock);
1054         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1055 }
1056
1057 /*
1058  * Reclaim callback -- invoked when memory is low.
1059  */
1060 /* ARGSUSED */
1061 static void
1062 hdr_recl(void *unused)
1063 {
1064         dprintf("hdr_recl called\n");
1065         /*
1066          * umem calls the reclaim func when we destroy the buf cache,
1067          * which is after we do arc_fini().
1068          */
1069         if (!arc_dead)
1070                 cv_signal(&arc_reclaim_thread_cv);
1071 }
1072
1073 static void
1074 buf_init(void)
1075 {
1076         uint64_t *ct;
1077         uint64_t hsize = 1ULL << 12;
1078         int i, j;
1079
1080         /*
1081          * The hash table is big enough to fill all of physical memory
1082          * with an average block size of zfs_arc_average_blocksize (default 8K).
1083          * By default, the table will take up
1084          * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1085          */
1086         while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1087                 hsize <<= 1;
1088 retry:
1089         buf_hash_table.ht_mask = hsize - 1;
1090 #if defined(_KERNEL) && defined(HAVE_SPL)
1091         /*
1092          * Large allocations which do not require contiguous pages
1093          * should be using vmem_alloc() in the linux kernel
1094          */
1095         buf_hash_table.ht_table =
1096             vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
1097 #else
1098         buf_hash_table.ht_table =
1099             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1100 #endif
1101         if (buf_hash_table.ht_table == NULL) {
1102                 ASSERT(hsize > (1ULL << 8));
1103                 hsize >>= 1;
1104                 goto retry;
1105         }
1106
1107         hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1108             0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1109         hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1110             HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1111             NULL, NULL, 0);
1112         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1113             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1114
1115         for (i = 0; i < 256; i++)
1116                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1117                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1118
1119         for (i = 0; i < BUF_LOCKS; i++) {
1120                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1121                     NULL, MUTEX_DEFAULT, NULL);
1122         }
1123 }
1124
1125 /*
1126  * Transition between the two allocation states for the arc_buf_hdr struct.
1127  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1128  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1129  * version is used when a cache buffer is only in the L2ARC in order to reduce
1130  * memory usage.
1131  */
1132 static arc_buf_hdr_t *
1133 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1134 {
1135         arc_buf_hdr_t *nhdr;
1136         l2arc_dev_t *dev;
1137
1138         ASSERT(HDR_HAS_L2HDR(hdr));
1139         ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1140             (old == hdr_l2only_cache && new == hdr_full_cache));
1141
1142         dev = hdr->b_l2hdr.b_dev;
1143         nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1144
1145         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1146         buf_hash_remove(hdr);
1147
1148         bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1149
1150         if (new == hdr_full_cache) {
1151                 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1152                 /*
1153                  * arc_access and arc_change_state need to be aware that a
1154                  * header has just come out of L2ARC, so we set its state to
1155                  * l2c_only even though it's about to change.
1156                  */
1157                 nhdr->b_l1hdr.b_state = arc_l2c_only;
1158
1159                 /* Verify previous threads set to NULL before freeing */
1160                 ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1161         } else {
1162                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1163                 ASSERT0(hdr->b_l1hdr.b_datacnt);
1164
1165                 /*
1166                  * If we've reached here, We must have been called from
1167                  * arc_evict_hdr(), as such we should have already been
1168                  * removed from any ghost list we were previously on
1169                  * (which protects us from racing with arc_evict_state),
1170                  * thus no locking is needed during this check.
1171                  */
1172                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1173
1174                 /*
1175                  * A buffer must not be moved into the arc_l2c_only
1176                  * state if it's not finished being written out to the
1177                  * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
1178                  * might try to be accessed, even though it was removed.
1179                  */
1180                 VERIFY(!HDR_L2_WRITING(hdr));
1181                 VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1182
1183                 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1184         }
1185         /*
1186          * The header has been reallocated so we need to re-insert it into any
1187          * lists it was on.
1188          */
1189         (void) buf_hash_insert(nhdr, NULL);
1190
1191         ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1192
1193         mutex_enter(&dev->l2ad_mtx);
1194
1195         /*
1196          * We must place the realloc'ed header back into the list at
1197          * the same spot. Otherwise, if it's placed earlier in the list,
1198          * l2arc_write_buffers() could find it during the function's
1199          * write phase, and try to write it out to the l2arc.
1200          */
1201         list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1202         list_remove(&dev->l2ad_buflist, hdr);
1203
1204         mutex_exit(&dev->l2ad_mtx);
1205
1206         /*
1207          * Since we're using the pointer address as the tag when
1208          * incrementing and decrementing the l2ad_alloc refcount, we
1209          * must remove the old pointer (that we're about to destroy) and
1210          * add the new pointer to the refcount. Otherwise we'd remove
1211          * the wrong pointer address when calling arc_hdr_destroy() later.
1212          */
1213
1214         (void) refcount_remove_many(&dev->l2ad_alloc,
1215             hdr->b_l2hdr.b_asize, hdr);
1216
1217         (void) refcount_add_many(&dev->l2ad_alloc,
1218             nhdr->b_l2hdr.b_asize, nhdr);
1219
1220         buf_discard_identity(hdr);
1221         hdr->b_freeze_cksum = NULL;
1222         kmem_cache_free(old, hdr);
1223
1224         return (nhdr);
1225 }
1226
1227
/* hz / 16 clock ticks, i.e. one sixteenth of a second. */
#define ARC_MINTIME     (hz>>4) /* 62 ms */
1229
1230 static void
1231 arc_cksum_verify(arc_buf_t *buf)
1232 {
1233         zio_cksum_t zc;
1234
1235         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1236                 return;
1237
1238         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1239         if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1240                 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1241                 return;
1242         }
1243         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1244         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1245                 panic("buffer modified while frozen!");
1246         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1247 }
1248
1249 static int
1250 arc_cksum_equal(arc_buf_t *buf)
1251 {
1252         zio_cksum_t zc;
1253         int equal;
1254
1255         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1256         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1257         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1258         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1259
1260         return (equal);
1261 }
1262
1263 static void
1264 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1265 {
1266         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1267                 return;
1268
1269         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1270         if (buf->b_hdr->b_freeze_cksum != NULL) {
1271                 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1272                 return;
1273         }
1274         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1275             KM_SLEEP);
1276         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1277             buf->b_hdr->b_freeze_cksum);
1278         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1279         arc_buf_watch(buf);
1280 }
1281
1282 #ifndef _KERNEL
1283 void
1284 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
1285 {
1286         panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
1287 }
1288 #endif
1289
1290 /* ARGSUSED */
1291 static void
1292 arc_buf_unwatch(arc_buf_t *buf)
1293 {
1294 #ifndef _KERNEL
1295         if (arc_watch) {
1296                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
1297                     PROT_READ | PROT_WRITE));
1298         }
1299 #endif
1300 }
1301
1302 /* ARGSUSED */
1303 static void
1304 arc_buf_watch(arc_buf_t *buf)
1305 {
1306 #ifndef _KERNEL
1307         if (arc_watch)
1308                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
1309 #endif
1310 }
1311
1312 static arc_buf_contents_t
1313 arc_buf_type(arc_buf_hdr_t *hdr)
1314 {
1315         if (HDR_ISTYPE_METADATA(hdr)) {
1316                 return (ARC_BUFC_METADATA);
1317         } else {
1318                 return (ARC_BUFC_DATA);
1319         }
1320 }
1321
1322 static uint32_t
1323 arc_bufc_to_flags(arc_buf_contents_t type)
1324 {
1325         switch (type) {
1326         case ARC_BUFC_DATA:
1327                 /* metadata field is 0 if buffer contains normal data */
1328                 return (0);
1329         case ARC_BUFC_METADATA:
1330                 return (ARC_FLAG_BUFC_METADATA);
1331         default:
1332                 break;
1333         }
1334         panic("undefined ARC buffer type!");
1335         return ((uint32_t)-1);
1336 }
1337
1338 void
1339 arc_buf_thaw(arc_buf_t *buf)
1340 {
1341         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1342                 if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1343                         panic("modifying non-anon buffer!");
1344                 if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1345                         panic("modifying buffer while i/o in progress!");
1346                 arc_cksum_verify(buf);
1347         }
1348
1349         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1350         if (buf->b_hdr->b_freeze_cksum != NULL) {
1351                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1352                 buf->b_hdr->b_freeze_cksum = NULL;
1353         }
1354
1355         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1356
1357         arc_buf_unwatch(buf);
1358 }
1359
1360 void
1361 arc_buf_freeze(arc_buf_t *buf)
1362 {
1363         kmutex_t *hash_lock;
1364
1365         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1366                 return;
1367
1368         hash_lock = HDR_LOCK(buf->b_hdr);
1369         mutex_enter(hash_lock);
1370
1371         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1372             buf->b_hdr->b_l1hdr.b_state == arc_anon);
1373         arc_cksum_compute(buf, B_FALSE);
1374         mutex_exit(hash_lock);
1375
1376 }
1377
1378 static void
1379 add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1380 {
1381         arc_state_t *state;
1382
1383         ASSERT(HDR_HAS_L1HDR(hdr));
1384         ASSERT(MUTEX_HELD(hash_lock));
1385
1386         state = hdr->b_l1hdr.b_state;
1387
1388         if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1389             (state != arc_anon)) {
1390                 /* We don't use the L2-only state list. */
1391                 if (state != arc_l2c_only) {
1392                         arc_buf_contents_t type = arc_buf_type(hdr);
1393                         uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1394                         multilist_t *list = &state->arcs_list[type];
1395                         uint64_t *size = &state->arcs_lsize[type];
1396
1397                         multilist_remove(list, hdr);
1398
1399                         if (GHOST_STATE(state)) {
1400                                 ASSERT0(hdr->b_l1hdr.b_datacnt);
1401                                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1402                                 delta = hdr->b_size;
1403                         }
1404                         ASSERT(delta > 0);
1405                         ASSERT3U(*size, >=, delta);
1406                         atomic_add_64(size, -delta);
1407                 }
1408                 /* remove the prefetch flag if we get a reference */
1409                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1410         }
1411 }
1412
1413 static int
1414 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1415 {
1416         int cnt;
1417         arc_state_t *state = hdr->b_l1hdr.b_state;
1418
1419         ASSERT(HDR_HAS_L1HDR(hdr));
1420         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1421         ASSERT(!GHOST_STATE(state));
1422
1423         /*
1424          * arc_l2c_only counts as a ghost state so we don't need to explicitly
1425          * check to prevent usage of the arc_l2c_only list.
1426          */
1427         if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1428             (state != arc_anon)) {
1429                 arc_buf_contents_t type = arc_buf_type(hdr);
1430                 multilist_t *list = &state->arcs_list[type];
1431                 uint64_t *size = &state->arcs_lsize[type];
1432
1433                 multilist_insert(list, hdr);
1434
1435                 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1436                 atomic_add_64(size, hdr->b_size *
1437                     hdr->b_l1hdr.b_datacnt);
1438         }
1439         return (cnt);
1440 }
1441
1442 /*
1443  * Returns detailed information about a specific arc buffer.  When the
1444  * state_index argument is set the function will calculate the arc header
1445  * list position for its arc state.  Since this requires a linear traversal
1446  * callers are strongly encourage not to do this.  However, it can be helpful
1447  * for targeted analysis so the functionality is provided.
1448  */
1449 void
1450 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
1451 {
1452         arc_buf_hdr_t *hdr = ab->b_hdr;
1453         l1arc_buf_hdr_t *l1hdr = NULL;
1454         l2arc_buf_hdr_t *l2hdr = NULL;
1455         arc_state_t *state = NULL;
1456
1457         if (HDR_HAS_L1HDR(hdr)) {
1458                 l1hdr = &hdr->b_l1hdr;
1459                 state = l1hdr->b_state;
1460         }
1461         if (HDR_HAS_L2HDR(hdr))
1462                 l2hdr = &hdr->b_l2hdr;
1463
1464         memset(abi, 0, sizeof (arc_buf_info_t));
1465         abi->abi_flags = hdr->b_flags;
1466
1467         if (l1hdr) {
1468                 abi->abi_datacnt = l1hdr->b_datacnt;
1469                 abi->abi_access = l1hdr->b_arc_access;
1470                 abi->abi_mru_hits = l1hdr->b_mru_hits;
1471                 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
1472                 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
1473                 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
1474                 abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
1475         }
1476
1477         if (l2hdr) {
1478                 abi->abi_l2arc_dattr = l2hdr->b_daddr;
1479                 abi->abi_l2arc_asize = l2hdr->b_asize;
1480                 abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr);
1481                 abi->abi_l2arc_hits = l2hdr->b_hits;
1482         }
1483
1484         abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
1485         abi->abi_state_contents = arc_buf_type(hdr);
1486         abi->abi_size = hdr->b_size;
1487 }
1488
/*
 * Move the supplied buffer header to the indicated state.  The hash lock
 * for the buffer must be held by the caller (this is asserted).
 *
 *   new_state - the state the header is moving to; must differ from the
 *               header's current state
 *   hdr       - the header being moved
 *   hash_lock - the hash lock protecting hdr
 *
 * The work is done in this order:
 *   1. If the header has no outstanding holds, it is taken off the old
 *      state's evictable list and put on the new state's list, and the
 *      per-type arcs_lsize counters are adjusted to match.
 *   2. If the new state is arc_anon and the header is in the hash table,
 *      it is removed from the hash table.
 *   3. The arcs_size reference counts of the new and old states are
 *      adjusted (arc_l2c_only is ignored for this accounting).
 *   4. The new state is recorded in the L1 header, if there is one.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
    kmutex_t *hash_lock)
{
        arc_state_t *old_state;
        int64_t refcnt;
        uint32_t datacnt;
        uint64_t from_delta, to_delta;
        arc_buf_contents_t buftype = arc_buf_type(hdr);

        /*
         * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
         * in arc_read() when bringing a buffer out of the L2ARC.  However, the
         * L1 hdr doesn't always exist when we change state to arc_anon before
         * destroying a header, in which case reallocating to add the L1 hdr is
         * pointless.
         */
        if (HDR_HAS_L1HDR(hdr)) {
                old_state = hdr->b_l1hdr.b_state;
                refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
                datacnt = hdr->b_l1hdr.b_datacnt;
        } else {
                /* no L1 portion: treat the header as L2-only with no data */
                old_state = arc_l2c_only;
                refcnt = 0;
                datacnt = 0;
        }

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT3P(new_state, !=, old_state);
        ASSERT(refcnt == 0 || datacnt > 0);
        ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
        ASSERT(old_state != arc_anon || datacnt <= 1);

        /*
         * Default amount to move between states: the size of every data
         * buffer attached to the header.  Ghost states override this below.
         */
        from_delta = to_delta = datacnt * hdr->b_size;

        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
                if (old_state != arc_anon && old_state != arc_l2c_only) {
                        uint64_t *size = &old_state->arcs_lsize[buftype];

                        ASSERT(HDR_HAS_L1HDR(hdr));
                        multilist_remove(&old_state->arcs_list[buftype], hdr);

                        /*
                         * If prefetching out of the ghost cache,
                         * we will have a non-zero datacnt.
                         */
                        if (GHOST_STATE(old_state) && datacnt == 0) {
                                /* ghost elements have a ghost size */
                                ASSERT(hdr->b_l1hdr.b_buf == NULL);
                                from_delta = hdr->b_size;
                        }
                        ASSERT3U(*size, >=, from_delta);
                        atomic_add_64(size, -from_delta);
                }
                if (new_state != arc_anon && new_state != arc_l2c_only) {
                        uint64_t *size = &new_state->arcs_lsize[buftype];

                        /*
                         * An L1 header always exists here, since if we're
                         * moving to some L1-cached state (i.e. not l2c_only or
                         * anonymous), we realloc the header to add an L1hdr
                         * beforehand.
                         */
                        ASSERT(HDR_HAS_L1HDR(hdr));
                        multilist_insert(&new_state->arcs_list[buftype], hdr);

                        /* ghost elements have a ghost size */
                        if (GHOST_STATE(new_state)) {
                                ASSERT0(datacnt);
                                ASSERT(hdr->b_l1hdr.b_buf == NULL);
                                to_delta = hdr->b_size;
                        }
                        atomic_add_64(size, to_delta);
                }
        }

        ASSERT(!BUF_EMPTY(hdr));
        /* anonymous headers are not kept in the hash table */
        if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
                buf_hash_remove(hdr);

        /* adjust state sizes (ignore arc_l2c_only) */

        if (to_delta && new_state != arc_l2c_only) {
                ASSERT(HDR_HAS_L1HDR(hdr));
                if (GHOST_STATE(new_state)) {
                        ASSERT0(datacnt);

                        /*
                         * When moving a header to a ghost state, we first
                         * remove all arc buffers. Thus, we'll have a
                         * datacnt of zero, and no arc buffer to use for
                         * the reference. As a result, we use the arc
                         * header pointer for the reference.
                         */
                        (void) refcount_add_many(&new_state->arcs_size,
                            hdr->b_size, hdr);
                } else {
                        arc_buf_t *buf;
                        ASSERT3U(datacnt, !=, 0);

                        /*
                         * Each individual buffer holds a unique reference,
                         * thus we must add each of these references one
                         * at a time.
                         */
                        for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
                            buf = buf->b_next) {
                                (void) refcount_add_many(&new_state->arcs_size,
                                    hdr->b_size, buf);
                        }
                }
        }

        if (from_delta && old_state != arc_l2c_only) {
                ASSERT(HDR_HAS_L1HDR(hdr));
                if (GHOST_STATE(old_state)) {
                        /*
                         * When moving a header off of a ghost state,
                         * there's the possibility for datacnt to be
                         * non-zero. This is because we first add the
                         * arc buffer to the header prior to changing
                         * the header's state. Since we used the header
                         * for the reference when putting the header on
                         * the ghost state, we must balance that and use
                         * the header when removing off the ghost state
                         * (even though datacnt is non zero).
                         */

                        IMPLY(datacnt == 0, new_state == arc_anon ||
                            new_state == arc_l2c_only);

                        (void) refcount_remove_many(&old_state->arcs_size,
                            hdr->b_size, hdr);
                } else {
                        arc_buf_t *buf;
                        ASSERT3U(datacnt, !=, 0);

                        /*
                         * Each individual buffer holds a unique reference,
                         * thus we must remove each of these references one
                         * at a time.
                         */
                        for (buf = hdr->b_l1hdr.b_buf; buf != NULL;
                            buf = buf->b_next) {
                                (void) refcount_remove_many(
                                    &old_state->arcs_size, hdr->b_size, buf);
                        }
                }
        }

        /* the state pointer lives in the L1 portion, so record it only there */
        if (HDR_HAS_L1HDR(hdr))
                hdr->b_l1hdr.b_state = new_state;

        /*
         * L2 headers should never be on the L2 state list since they don't
         * have L1 headers allocated.
         */
        ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
            multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
1657
1658 void
1659 arc_space_consume(uint64_t space, arc_space_type_t type)
1660 {
1661         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1662
1663         switch (type) {
1664         default:
1665                 break;
1666         case ARC_SPACE_DATA:
1667                 ARCSTAT_INCR(arcstat_data_size, space);
1668                 break;
1669         case ARC_SPACE_META:
1670                 ARCSTAT_INCR(arcstat_metadata_size, space);
1671                 break;
1672         case ARC_SPACE_OTHER:
1673                 ARCSTAT_INCR(arcstat_other_size, space);
1674                 break;
1675         case ARC_SPACE_HDRS:
1676                 ARCSTAT_INCR(arcstat_hdr_size, space);
1677                 break;
1678         case ARC_SPACE_L2HDRS:
1679                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1680                 break;
1681         }
1682
1683         if (type != ARC_SPACE_DATA)
1684                 ARCSTAT_INCR(arcstat_meta_used, space);
1685
1686         atomic_add_64(&arc_size, space);
1687 }
1688
1689 void
1690 arc_space_return(uint64_t space, arc_space_type_t type)
1691 {
1692         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1693
1694         switch (type) {
1695         default:
1696                 break;
1697         case ARC_SPACE_DATA:
1698                 ARCSTAT_INCR(arcstat_data_size, -space);
1699                 break;
1700         case ARC_SPACE_META:
1701                 ARCSTAT_INCR(arcstat_metadata_size, -space);
1702                 break;
1703         case ARC_SPACE_OTHER:
1704                 ARCSTAT_INCR(arcstat_other_size, -space);
1705                 break;
1706         case ARC_SPACE_HDRS:
1707                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1708                 break;
1709         case ARC_SPACE_L2HDRS:
1710                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1711                 break;
1712         }
1713
1714         if (type != ARC_SPACE_DATA) {
1715                 ASSERT(arc_meta_used >= space);
1716                 if (arc_meta_max < arc_meta_used)
1717                         arc_meta_max = arc_meta_used;
1718                 ARCSTAT_INCR(arcstat_meta_used, -space);
1719         }
1720
1721         ASSERT(arc_size >= space);
1722         atomic_add_64(&arc_size, -space);
1723 }
1724
1725 arc_buf_t *
1726 arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
1727 {
1728         arc_buf_hdr_t *hdr;
1729         arc_buf_t *buf;
1730
1731         VERIFY3U(size, <=, spa_maxblocksize(spa));
1732         hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
1733         ASSERT(BUF_EMPTY(hdr));
1734         ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
1735         hdr->b_size = size;
1736         hdr->b_spa = spa_load_guid(spa);
1737         hdr->b_l1hdr.b_mru_hits = 0;
1738         hdr->b_l1hdr.b_mru_ghost_hits = 0;
1739         hdr->b_l1hdr.b_mfu_hits = 0;
1740         hdr->b_l1hdr.b_mfu_ghost_hits = 0;
1741         hdr->b_l1hdr.b_l2_hits = 0;
1742
1743         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1744         buf->b_hdr = hdr;
1745         buf->b_data = NULL;
1746         buf->b_efunc = NULL;
1747         buf->b_private = NULL;
1748         buf->b_next = NULL;
1749
1750         hdr->b_flags = arc_bufc_to_flags(type);
1751         hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1752
1753         hdr->b_l1hdr.b_buf = buf;
1754         hdr->b_l1hdr.b_state = arc_anon;
1755         hdr->b_l1hdr.b_arc_access = 0;
1756         hdr->b_l1hdr.b_datacnt = 1;
1757         hdr->b_l1hdr.b_tmp_cdata = NULL;
1758
1759         arc_get_data_buf(buf);
1760
1761         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1762         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1763
1764         return (buf);
1765 }
1766
/*
 * Hold tag used for buffers that are out on loan: arc_loan_buf() and
 * arc_loan_inuse_buf() take the hold with this tag, and arc_return_buf()
 * drops it.
 */
static char *arc_onloan_tag = "onloan";
1768
1769 /*
1770  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1771  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1772  * buffers must be returned to the arc before they can be used by the DMU or
1773  * freed.
1774  */
1775 arc_buf_t *
1776 arc_loan_buf(spa_t *spa, uint64_t size)
1777 {
1778         arc_buf_t *buf;
1779
1780         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1781
1782         atomic_add_64(&arc_loaned_bytes, size);
1783         return (buf);
1784 }
1785
1786 /*
1787  * Return a loaned arc buffer to the arc.
1788  */
1789 void
1790 arc_return_buf(arc_buf_t *buf, void *tag)
1791 {
1792         arc_buf_hdr_t *hdr = buf->b_hdr;
1793
1794         ASSERT(buf->b_data != NULL);
1795         ASSERT(HDR_HAS_L1HDR(hdr));
1796         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1797         (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1798
1799         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1800 }
1801
1802 /* Detach an arc_buf from a dbuf (tag) */
1803 void
1804 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1805 {
1806         arc_buf_hdr_t *hdr = buf->b_hdr;
1807
1808         ASSERT(buf->b_data != NULL);
1809         ASSERT(HDR_HAS_L1HDR(hdr));
1810         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1811         (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
1812         buf->b_efunc = NULL;
1813         buf->b_private = NULL;
1814
1815         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1816 }
1817
1818 static arc_buf_t *
1819 arc_buf_clone(arc_buf_t *from)
1820 {
1821         arc_buf_t *buf;
1822         arc_buf_hdr_t *hdr = from->b_hdr;
1823         uint64_t size = hdr->b_size;
1824
1825         ASSERT(HDR_HAS_L1HDR(hdr));
1826         ASSERT(hdr->b_l1hdr.b_state != arc_anon);
1827
1828         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1829         buf->b_hdr = hdr;
1830         buf->b_data = NULL;
1831         buf->b_efunc = NULL;
1832         buf->b_private = NULL;
1833         buf->b_next = hdr->b_l1hdr.b_buf;
1834         hdr->b_l1hdr.b_buf = buf;
1835         arc_get_data_buf(buf);
1836         bcopy(from->b_data, buf->b_data, size);
1837
1838         /*
1839          * This buffer already exists in the arc so create a duplicate
1840          * copy for the caller.  If the buffer is associated with user data
1841          * then track the size and number of duplicates.  These stats will be
1842          * updated as duplicate buffers are created and destroyed.
1843          */
1844         if (HDR_ISTYPE_DATA(hdr)) {
1845                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1846                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1847         }
1848         hdr->b_l1hdr.b_datacnt += 1;
1849         return (buf);
1850 }
1851
/*
 * Take an additional hold, identified by 'tag', on a buffer that is still
 * cached.
 *
 * If the buffer has already been evicted (b_data is NULL) nothing is done;
 * the function has no return value, so callers must check buf->b_data
 * themselves to learn whether the hold was taken.  Otherwise the hold is
 * added, the access is recorded through arc_access(), and the ARC hit
 * statistics are updated.
 */
void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock;

        /*
         * Check to see if this buffer is evicted.  Callers
         * must verify b_data != NULL to know if the add_ref
         * was successful.
         */
        mutex_enter(&buf->b_evict_lock);
        if (buf->b_data == NULL) {
                mutex_exit(&buf->b_evict_lock);
                return;
        }
        /*
         * Take the hash lock while still holding b_evict_lock, then re-read
         * b_hdr under the hash lock before letting go of b_evict_lock.
         */
        hash_lock = HDR_LOCK(buf->b_hdr);
        mutex_enter(hash_lock);
        hdr = buf->b_hdr;
        ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
        mutex_exit(&buf->b_evict_lock);

        /* a buffer that still has data is expected to be in MRU or MFU */
        ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
            hdr->b_l1hdr.b_state == arc_mfu);

        add_reference(hdr, hash_lock, tag);
        DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
        arc_access(hdr, hash_lock);
        mutex_exit(hash_lock);
        /* account for the hit, split by demand/prefetch and data/metadata */
        ARCSTAT_BUMP(arcstat_hits);
        ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
            demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
            data, metadata, hits);
}
1887
1888 static void
1889 arc_buf_free_on_write(void *data, size_t size,
1890     void (*free_func)(void *, size_t))
1891 {
1892         l2arc_data_free_t *df;
1893
1894         df = kmem_alloc(sizeof (*df), KM_SLEEP);
1895         df->l2df_data = data;
1896         df->l2df_size = size;
1897         df->l2df_func = free_func;
1898         mutex_enter(&l2arc_free_on_write_mtx);
1899         list_insert_head(l2arc_free_on_write, df);
1900         mutex_exit(&l2arc_free_on_write_mtx);
1901 }
1902
1903 /*
1904  * Free the arc data buffer.  If it is an l2arc write in progress,
1905  * the buffer is placed on l2arc_free_on_write to be freed later.
1906  */
1907 static void
1908 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1909 {
1910         arc_buf_hdr_t *hdr = buf->b_hdr;
1911
1912         if (HDR_L2_WRITING(hdr)) {
1913                 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1914                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1915         } else {
1916                 free_func(buf->b_data, hdr->b_size);
1917         }
1918 }
1919
1920 static void
1921 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1922 {
1923         ASSERT(HDR_HAS_L2HDR(hdr));
1924         ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
1925
1926         /*
1927          * The b_tmp_cdata field is linked off of the b_l1hdr, so if
1928          * that doesn't exist, the header is in the arc_l2c_only state,
1929          * and there isn't anything to free (it's already been freed).
1930          */
1931         if (!HDR_HAS_L1HDR(hdr))
1932                 return;
1933
1934         /*
1935          * The header isn't being written to the l2arc device, thus it
1936          * shouldn't have a b_tmp_cdata to free.
1937          */
1938         if (!HDR_L2_WRITING(hdr)) {
1939                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1940                 return;
1941         }
1942
1943         /*
1944          * The header does not have compression enabled. This can be due
1945          * to the buffer not being compressible, or because we're
1946          * freeing the buffer before the second phase of
1947          * l2arc_write_buffer() has started (which does the compression
1948          * step). In either case, b_tmp_cdata does not point to a
1949          * separately compressed buffer, so there's nothing to free (it
1950          * points to the same buffer as the arc_buf_t's b_data field).
1951          */
1952         if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
1953                 hdr->b_l1hdr.b_tmp_cdata = NULL;
1954                 return;
1955         }
1956
1957         /*
1958          * There's nothing to free since the buffer was all zero's and
1959          * compressed to a zero length buffer.
1960          */
1961         if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
1962                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1963                 return;
1964         }
1965
1966         ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
1967
1968         arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
1969             hdr->b_size, zio_data_buf_free);
1970
1971         ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1972         hdr->b_l1hdr.b_tmp_cdata = NULL;
1973 }
1974
1975 /*
1976  * Free up buf->b_data and if 'remove' is set, then pull the
1977  * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
1978  */
1979 static void
1980 arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
1981 {
1982         arc_buf_t **bufp;
1983
1984         /* free up data associated with the buf */
1985         if (buf->b_data != NULL) {
1986                 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
1987                 uint64_t size = buf->b_hdr->b_size;
1988                 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
1989
1990                 arc_cksum_verify(buf);
1991                 arc_buf_unwatch(buf);
1992
1993                 if (type == ARC_BUFC_METADATA) {
1994                         arc_buf_data_free(buf, zio_buf_free);
1995                         arc_space_return(size, ARC_SPACE_META);
1996                 } else {
1997                         ASSERT(type == ARC_BUFC_DATA);
1998                         arc_buf_data_free(buf, zio_data_buf_free);
1999                         arc_space_return(size, ARC_SPACE_DATA);
2000                 }
2001
2002                 /* protected by hash lock, if in the hash table */
2003                 if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2004                         uint64_t *cnt = &state->arcs_lsize[type];
2005
2006                         ASSERT(refcount_is_zero(
2007                             &buf->b_hdr->b_l1hdr.b_refcnt));
2008                         ASSERT(state != arc_anon && state != arc_l2c_only);
2009
2010                         ASSERT3U(*cnt, >=, size);
2011                         atomic_add_64(cnt, -size);
2012                 }
2013
2014                 (void) refcount_remove_many(&state->arcs_size, size, buf);
2015                 buf->b_data = NULL;
2016
2017                 /*
2018                  * If we're destroying a duplicate buffer make sure
2019                  * that the appropriate statistics are updated.
2020                  */
2021                 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2022                     HDR_ISTYPE_DATA(buf->b_hdr)) {
2023                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2024                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2025                 }
2026                 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2027                 buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2028         }
2029
2030         /* only remove the buf if requested */
2031         if (!remove)
2032                 return;
2033
2034         /* remove the buf from the hdr list */
2035         for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2036             bufp = &(*bufp)->b_next)
2037                 continue;
2038         *bufp = buf->b_next;
2039         buf->b_next = NULL;
2040
2041         ASSERT(buf->b_efunc == NULL);
2042
2043         /* clean up the buf */
2044         buf->b_hdr = NULL;
2045         kmem_cache_free(buf_cache, buf);
2046 }
2047
2048 static void
2049 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2050 {
2051         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2052         l2arc_dev_t *dev = l2hdr->b_dev;
2053
2054         ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2055         ASSERT(HDR_HAS_L2HDR(hdr));
2056
2057         list_remove(&dev->l2ad_buflist, hdr);
2058
2059         /*
2060          * We don't want to leak the b_tmp_cdata buffer that was
2061          * allocated in l2arc_write_buffers()
2062          */
2063         arc_buf_l2_cdata_free(hdr);
2064
2065         /*
2066          * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2067          * this header is being processed by l2arc_write_buffers() (i.e.
2068          * it's in the first stage of l2arc_write_buffers()).
2069          * Re-affirming that truth here, just to serve as a reminder. If
2070          * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2071          * may not have its HDR_L2_WRITING flag set. (the write may have
2072          * completed, in which case HDR_L2_WRITING will be false and the
2073          * b_daddr field will point to the address of the buffer on disk).
2074          */
2075         IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2076
2077         /*
2078          * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2079          * l2arc_write_buffers(). Since we've just removed this header
2080          * from the l2arc buffer list, this header will never reach the
2081          * second stage of l2arc_write_buffers(), which increments the
2082          * accounting stats for this header. Thus, we must be careful
2083          * not to decrement them for this header either.
2084          */
2085         if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2086                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2087                 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2088
2089                 vdev_space_update(dev->l2ad_vdev,
2090                     -l2hdr->b_asize, 0, 0);
2091
2092                 (void) refcount_remove_many(&dev->l2ad_alloc,
2093                     l2hdr->b_asize, hdr);
2094         }
2095
2096         hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2097 }
2098
/*
 * Destroy an arc buffer header and everything attached to it.
 *
 * The header must have no I/O in progress and must not be in the hash
 * table; if it has an L1 portion it must be anonymous with no holds (all
 * asserted).  The L2 portion, if present, is destroyed under the device's
 * l2ad_mtx.  The identity and frozen checksum are discarded, every
 * attached arc_buf_t is destroyed or handed to the user-eviction list,
 * and the header is returned to the kmem cache it came from.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
        if (HDR_HAS_L1HDR(hdr)) {
                ASSERT(hdr->b_l1hdr.b_buf == NULL ||
                    hdr->b_l1hdr.b_datacnt > 0);
                ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
                ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
        }
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
        ASSERT(!HDR_IN_HASH_TABLE(hdr));

        if (HDR_HAS_L2HDR(hdr)) {
                l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
                /* the caller may already hold l2ad_mtx; only take it if not */
                boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);

                if (!buflist_held)
                        mutex_enter(&dev->l2ad_mtx);

                /*
                 * Even though we checked this conditional above, we
                 * need to check this again now that we have the
                 * l2ad_mtx. This is because we could be racing with
                 * another thread calling l2arc_evict() which might have
                 * destroyed this header's L2 portion as we were waiting
                 * to acquire the l2ad_mtx. If that happens, we don't
                 * want to re-destroy the header's L2 portion.
                 */
                if (HDR_HAS_L2HDR(hdr))
                        arc_hdr_l2hdr_destroy(hdr);

                if (!buflist_held)
                        mutex_exit(&dev->l2ad_mtx);
        }

        if (!BUF_EMPTY(hdr))
                buf_discard_identity(hdr);

        if (hdr->b_freeze_cksum != NULL) {
                kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                hdr->b_freeze_cksum = NULL;
        }

        if (HDR_HAS_L1HDR(hdr)) {
                /* dispose of every arc_buf_t still attached to the header */
                while (hdr->b_l1hdr.b_buf) {
                        arc_buf_t *buf = hdr->b_l1hdr.b_buf;

                        if (buf->b_efunc != NULL) {
                                /*
                                 * The buf has an eviction callback: free
                                 * only its data, unlink it from the header
                                 * by hand, and queue it on
                                 * arc_eviction_list with b_hdr pointing at
                                 * arc_eviction_hdr.  The waiter on
                                 * arc_user_evicts_cv is then signalled.
                                 */
                                mutex_enter(&arc_user_evicts_lock);
                                mutex_enter(&buf->b_evict_lock);
                                ASSERT(buf->b_hdr != NULL);
                                arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
                                hdr->b_l1hdr.b_buf = buf->b_next;
                                buf->b_hdr = &arc_eviction_hdr;
                                buf->b_next = arc_eviction_list;
                                arc_eviction_list = buf;
                                mutex_exit(&buf->b_evict_lock);
                                cv_signal(&arc_user_evicts_cv);
                                mutex_exit(&arc_user_evicts_lock);
                        } else {
                                /* no callback: free data and the buf itself */
                                arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
                        }
                }
        }

        ASSERT3P(hdr->b_hash_next, ==, NULL);
        /* give the header back to the cache that matches its layout */
        if (HDR_HAS_L1HDR(hdr)) {
                ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
                ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
                kmem_cache_free(hdr_full_cache, hdr);
        } else {
                kmem_cache_free(hdr_l2only_cache, hdr);
        }
}
2173
/*
 * Drop the hold identified by 'tag' on an arc buffer and free what can be
 * freed.  The buffer must have data and no eviction callback.
 *
 * Three cases are handled:
 *   - The header is not anonymous: under the hash lock the hold is
 *     dropped; a duplicate buffer is destroyed, while the last buffer is
 *     kept and the header is flagged ARC_FLAG_BUF_AVAILABLE.
 *   - The header is anonymous with I/O in progress: the hold is dropped
 *     and the header is destroyed only if the I/O is no longer in
 *     progress by then.
 *   - The header is anonymous with no I/O: the hold is dropped and either
 *     this buffer alone (holds remain) or the whole header (no holds
 *     remain) is destroyed.
 */
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
        arc_buf_hdr_t *hdr = buf->b_hdr;
        /* headers in any state other than arc_anon are handled as hashed */
        int hashed = hdr->b_l1hdr.b_state != arc_anon;

        ASSERT(buf->b_efunc == NULL);
        ASSERT(buf->b_data != NULL);

        if (hashed) {
                kmutex_t *hash_lock = HDR_LOCK(hdr);

                mutex_enter(hash_lock);
                /* re-read the header now that the hash lock is held */
                hdr = buf->b_hdr;
                ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

                (void) remove_reference(hdr, hash_lock, tag);
                if (hdr->b_l1hdr.b_datacnt > 1) {
                        arc_buf_destroy(buf, TRUE);
                } else {
                        ASSERT(buf == hdr->b_l1hdr.b_buf);
                        ASSERT(buf->b_efunc == NULL);
                        hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
                }
                mutex_exit(hash_lock);
        } else if (HDR_IO_IN_PROGRESS(hdr)) {
                int destroy_hdr;
                /*
                 * We are in the middle of an async write.  Don't destroy
                 * this buffer unless the write completes before we finish
                 * decrementing the reference count.
                 */
                mutex_enter(&arc_user_evicts_lock);
                (void) remove_reference(hdr, NULL, tag);
                ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
                destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
                mutex_exit(&arc_user_evicts_lock);
                if (destroy_hdr)
                        arc_hdr_destroy(hdr);
        } else {
                /* holds remaining: free just this buf; otherwise the header */
                if (remove_reference(hdr, NULL, tag) > 0)
                        arc_buf_destroy(buf, TRUE);
                else
                        arc_hdr_destroy(hdr);
        }
}
2220
2221 boolean_t
2222 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2223 {
2224         arc_buf_hdr_t *hdr = buf->b_hdr;
2225         kmutex_t *hash_lock = NULL;
2226         boolean_t no_callback = (buf->b_efunc == NULL);
2227
2228         if (hdr->b_l1hdr.b_state == arc_anon) {
2229                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2230                 arc_buf_free(buf, tag);
2231                 return (no_callback);
2232         }
2233
2234         hash_lock = HDR_LOCK(hdr);
2235         mutex_enter(hash_lock);
2236         hdr = buf->b_hdr;
2237         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2238         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2239         ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2240         ASSERT(buf->b_data != NULL);
2241
2242         (void) remove_reference(hdr, hash_lock, tag);
2243         if (hdr->b_l1hdr.b_datacnt > 1) {
2244                 if (no_callback)
2245                         arc_buf_destroy(buf, TRUE);
2246         } else if (no_callback) {
2247                 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2248                 ASSERT(buf->b_efunc == NULL);
2249                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2250         }
2251         ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2252             refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2253         mutex_exit(hash_lock);
2254         return (no_callback);
2255 }
2256
2257 uint64_t
2258 arc_buf_size(arc_buf_t *buf)
2259 {
2260         return (buf->b_hdr->b_size);
2261 }
2262
2263 /*
2264  * Called from the DMU to determine if the current buffer should be
2265  * evicted. In order to ensure proper locking, the eviction must be initiated
2266  * from the DMU. Return true if the buffer is associated with user data and
2267  * duplicate buffers still exist.
2268  */
2269 boolean_t
2270 arc_buf_eviction_needed(arc_buf_t *buf)
2271 {
2272         arc_buf_hdr_t *hdr;
2273         boolean_t evict_needed = B_FALSE;
2274
2275         if (zfs_disable_dup_eviction)
2276                 return (B_FALSE);
2277
2278         mutex_enter(&buf->b_evict_lock);
2279         hdr = buf->b_hdr;
2280         if (hdr == NULL) {
2281                 /*
2282                  * We are in arc_do_user_evicts(); let that function
2283                  * perform the eviction.
2284                  */
2285                 ASSERT(buf->b_data == NULL);
2286                 mutex_exit(&buf->b_evict_lock);
2287                 return (B_FALSE);
2288         } else if (buf->b_data == NULL) {
2289                 /*
2290                  * We have already been added to the arc eviction list;
2291                  * recommend eviction.
2292                  */
2293                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2294                 mutex_exit(&buf->b_evict_lock);
2295                 return (B_TRUE);
2296         }
2297
2298         if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2299                 evict_needed = B_TRUE;
2300
2301         mutex_exit(&buf->b_evict_lock);
2302         return (evict_needed);
2303 }
2304
2305 /*
2306  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2307  * state of the header is dependent on its state prior to entering this
2308  * function. The following transitions are possible:
2309  *
2310  *    - arc_mru -> arc_mru_ghost
2311  *    - arc_mfu -> arc_mfu_ghost
2312  *    - arc_mru_ghost -> arc_l2c_only
2313  *    - arc_mru_ghost -> deleted
2314  *    - arc_mfu_ghost -> arc_l2c_only
2315  *    - arc_mfu_ghost -> deleted
2316  */
2317 static int64_t
2318 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
2319 {
2320         arc_state_t *evicted_state, *state;
2321         int64_t bytes_evicted = 0;
2322
2323         ASSERT(MUTEX_HELD(hash_lock));
2324         ASSERT(HDR_HAS_L1HDR(hdr));
2325
2326         state = hdr->b_l1hdr.b_state;
2327         if (GHOST_STATE(state)) {
2328                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2329                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
2330
2331                 /*
2332                  * l2arc_write_buffers() relies on a header's L1 portion
2333                  * (i.e. its b_tmp_cdata field) during its write phase.
2334                  * Thus, we cannot push a header onto the arc_l2c_only
2335                  * state (removing its L1 piece) until the header is
2336                  * done being written to the l2arc.
2337                  */
2338                 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2339                         ARCSTAT_BUMP(arcstat_evict_l2_skip);
2340                         return (bytes_evicted);
2341                 }
2342
2343                 ARCSTAT_BUMP(arcstat_deleted);
2344                 bytes_evicted += hdr->b_size;
2345
2346                 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2347
2348                 if (HDR_HAS_L2HDR(hdr)) {
2349                         /*
2350                          * This buffer is cached on the 2nd Level ARC;
2351                          * don't destroy the header.
2352                          */
2353                         arc_change_state(arc_l2c_only, hdr, hash_lock);
2354                         /*
2355                          * dropping from L1+L2 cached to L2-only,
2356                          * realloc to remove the L1 header.
2357                          */
2358                         hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2359                             hdr_l2only_cache);
2360                 } else {
2361                         arc_change_state(arc_anon, hdr, hash_lock);
2362                         arc_hdr_destroy(hdr);
2363                 }
2364                 return (bytes_evicted);
2365         }
2366
2367         ASSERT(state == arc_mru || state == arc_mfu);
2368         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2369
2370         /* prefetch buffers have a minimum lifespan */
2371         if (HDR_IO_IN_PROGRESS(hdr) ||
2372             ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2373             ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2374             arc_min_prefetch_lifespan)) {
2375                 ARCSTAT_BUMP(arcstat_evict_skip);
2376                 return (bytes_evicted);
2377         }
2378
2379         ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2380         ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2381         while (hdr->b_l1hdr.b_buf) {
2382                 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2383                 if (!mutex_tryenter(&buf->b_evict_lock)) {
2384                         ARCSTAT_BUMP(arcstat_mutex_miss);
2385                         break;
2386                 }
2387                 if (buf->b_data != NULL)
2388                         bytes_evicted += hdr->b_size;
2389                 if (buf->b_efunc != NULL) {
2390                         mutex_enter(&arc_user_evicts_lock);
2391                         arc_buf_destroy(buf, FALSE);
2392                         hdr->b_l1hdr.b_buf = buf->b_next;
2393                         buf->b_hdr = &arc_eviction_hdr;
2394                         buf->b_next = arc_eviction_list;
2395                         arc_eviction_list = buf;
2396                         cv_signal(&arc_user_evicts_cv);
2397                         mutex_exit(&arc_user_evicts_lock);
2398                         mutex_exit(&buf->b_evict_lock);
2399                 } else {
2400                         mutex_exit(&buf->b_evict_lock);
2401                         arc_buf_destroy(buf, TRUE);
2402                 }
2403         }
2404
2405         if (HDR_HAS_L2HDR(hdr)) {
2406                 ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
2407         } else {
2408                 if (l2arc_write_eligible(hdr->b_spa, hdr))
2409                         ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
2410                 else
2411                         ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
2412         }
2413
2414         if (hdr->b_l1hdr.b_datacnt == 0) {
2415                 arc_change_state(evicted_state, hdr, hash_lock);
2416                 ASSERT(HDR_IN_HASH_TABLE(hdr));
2417                 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2418                 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2419                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2420         }
2421
2422         return (bytes_evicted);
2423 }
2424
2425 static uint64_t
2426 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
2427     uint64_t spa, int64_t bytes)
2428 {
2429         multilist_sublist_t *mls;
2430         uint64_t bytes_evicted = 0;
2431         arc_buf_hdr_t *hdr;
2432         kmutex_t *hash_lock;
2433         int evict_count = 0;
2434
2435         ASSERT3P(marker, !=, NULL);
2436         ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
2437
2438         mls = multilist_sublist_lock(ml, idx);
2439
2440         for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
2441             hdr = multilist_sublist_prev(mls, marker)) {
2442                 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
2443                     (evict_count >= zfs_arc_evict_batch_limit))
2444                         break;
2445
2446                 /*
2447                  * To keep our iteration location, move the marker
2448                  * forward. Since we're not holding hdr's hash lock, we
2449                  * must be very careful and not remove 'hdr' from the
2450                  * sublist. Otherwise, other consumers might mistake the
2451                  * 'hdr' as not being on a sublist when they call the
2452                  * multilist_link_active() function (they all rely on
2453                  * the hash lock protecting concurrent insertions and
2454                  * removals). multilist_sublist_move_forward() was
2455                  * specifically implemented to ensure this is the case
2456                  * (only 'marker' will be removed and re-inserted).
2457                  */
2458                 multilist_sublist_move_forward(mls, marker);
2459
2460                 /*
2461                  * The only case where the b_spa field should ever be
2462                  * zero, is the marker headers inserted by
2463                  * arc_evict_state(). It's possible for multiple threads
2464                  * to be calling arc_evict_state() concurrently (e.g.
2465                  * dsl_pool_close() and zio_inject_fault()), so we must
2466                  * skip any markers we see from these other threads.
2467                  */
2468                 if (hdr->b_spa == 0)
2469                         continue;
2470
2471                 /* we're only interested in evicting buffers of a certain spa */
2472                 if (spa != 0 && hdr->b_spa != spa) {
2473                         ARCSTAT_BUMP(arcstat_evict_skip);
2474                         continue;
2475                 }
2476
2477                 hash_lock = HDR_LOCK(hdr);
2478
2479                 /*
2480                  * We aren't calling this function from any code path
2481                  * that would already be holding a hash lock, so we're
2482                  * asserting on this assumption to be defensive in case
2483                  * this ever changes. Without this check, it would be
2484                  * possible to incorrectly increment arcstat_mutex_miss
2485                  * below (e.g. if the code changed such that we called
2486                  * this function with a hash lock held).
2487                  */
2488                 ASSERT(!MUTEX_HELD(hash_lock));
2489
2490                 if (mutex_tryenter(hash_lock)) {
2491                         uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
2492                         mutex_exit(hash_lock);
2493
2494                         bytes_evicted += evicted;
2495
2496                         /*
2497                          * If evicted is zero, arc_evict_hdr() must have
2498                          * decided to skip this header, don't increment
2499                          * evict_count in this case.
2500                          */
2501                         if (evicted != 0)
2502                                 evict_count++;
2503
2504                         /*
2505                          * If arc_size isn't overflowing, signal any
2506                          * threads that might happen to be waiting.
2507                          *
2508                          * For each header evicted, we wake up a single
2509                          * thread. If we used cv_broadcast, we could
2510                          * wake up "too many" threads causing arc_size
2511                          * to significantly overflow arc_c; since
2512                          * arc_get_data_buf() doesn't check for overflow
2513                          * when it's woken up (it doesn't because it's
2514                          * possible for the ARC to be overflowing while
2515                          * full of un-evictable buffers, and the
2516                          * function should proceed in this case).
2517                          *
2518                          * If threads are left sleeping, due to not
2519                          * using cv_broadcast, they will be woken up
2520                          * just before arc_reclaim_thread() sleeps.
2521                          */
2522                         mutex_enter(&arc_reclaim_lock);
2523                         if (!arc_is_overflowing())
2524                                 cv_signal(&arc_reclaim_waiters_cv);
2525                         mutex_exit(&arc_reclaim_lock);
2526                 } else {
2527                         ARCSTAT_BUMP(arcstat_mutex_miss);
2528                 }
2529         }
2530
2531         multilist_sublist_unlock(mls);
2532
2533         return (bytes_evicted);
2534 }
2535
2536 /*
2537  * Evict buffers from the given arc state, until we've removed the
2538  * specified number of bytes. Move the removed buffers to the
2539  * appropriate evict state.
2540  *
2541  * This function makes a "best effort". It skips over any buffers
2542  * it can't get a hash_lock on, and so, may not catch all candidates.
2543  * It may also return without evicting as much space as requested.
2544  *
2545  * If bytes is specified using the special value ARC_EVICT_ALL, this
2546  * will evict all available (i.e. unlocked and evictable) buffers from
2547  * the given arc state; which is used by arc_flush().
2548  */
2549 static uint64_t
2550 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
2551     arc_buf_contents_t type)
2552 {
2553         uint64_t total_evicted = 0;
2554         multilist_t *ml = &state->arcs_list[type];
2555         int num_sublists;
2556         arc_buf_hdr_t **markers;
2557         int i;
2558
2559         ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
2560
2561         num_sublists = multilist_get_num_sublists(ml);
2562
2563         /*
2564          * If we've tried to evict from each sublist, made some
2565          * progress, but still have not hit the target number of bytes
2566          * to evict, we want to keep trying. The markers allow us to
2567          * pick up where we left off for each individual sublist, rather
2568          * than starting from the tail each time.
2569          */
2570         markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
2571         for (i = 0; i < num_sublists; i++) {
2572                 multilist_sublist_t *mls;
2573
2574                 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
2575
2576                 /*
2577                  * A b_spa of 0 is used to indicate that this header is
2578                  * a marker. This fact is used in arc_adjust_type() and
2579                  * arc_evict_state_impl().
2580                  */
2581                 markers[i]->b_spa = 0;
2582
2583                 mls = multilist_sublist_lock(ml, i);
2584                 multilist_sublist_insert_tail(mls, markers[i]);
2585                 multilist_sublist_unlock(mls);
2586         }
2587
2588         /*
2589          * While we haven't hit our target number of bytes to evict, or
2590          * we're evicting all available buffers.
2591          */
2592         while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
2593                 /*
2594                  * Start eviction using a randomly selected sublist,
2595                  * this is to try and evenly balance eviction across all
2596                  * sublists. Always starting at the same sublist
2597                  * (e.g. index 0) would cause evictions to favor certain
2598                  * sublists over others.
2599                  */
2600                 int sublist_idx = multilist_get_random_index(ml);
2601                 uint64_t scan_evicted = 0;
2602
2603                 for (i = 0; i < num_sublists; i++) {
2604                         uint64_t bytes_remaining;
2605                         uint64_t bytes_evicted;
2606
2607                         if (bytes == ARC_EVICT_ALL)
2608                                 bytes_remaining = ARC_EVICT_ALL;
2609                         else if (total_evicted < bytes)
2610                                 bytes_remaining = bytes - total_evicted;
2611                         else
2612                                 break;
2613
2614                         bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
2615                             markers[sublist_idx], spa, bytes_remaining);
2616
2617                         scan_evicted += bytes_evicted;
2618                         total_evicted += bytes_evicted;
2619
2620                         /* we've reached the end, wrap to the beginning */
2621                         if (++sublist_idx >= num_sublists)
2622                                 sublist_idx = 0;
2623                 }
2624
2625                 /*
2626                  * If we didn't evict anything during this scan, we have
2627                  * no reason to believe we'll evict more during another
2628                  * scan, so break the loop.
2629                  */
2630                 if (scan_evicted == 0) {
2631                         /* This isn't possible, let's make that obvious */
2632                         ASSERT3S(bytes, !=, 0);
2633
2634                         /*
2635                          * When bytes is ARC_EVICT_ALL, the only way to
2636                          * break the loop is when scan_evicted is zero.
2637                          * In that case, we actually have evicted enough,
2638                          * so we don't want to increment the kstat.
2639                          */
2640                         if (bytes != ARC_EVICT_ALL) {
2641                                 ASSERT3S(total_evicted, <, bytes);
2642                                 ARCSTAT_BUMP(arcstat_evict_not_enough);
2643                         }
2644
2645                         break;
2646                 }
2647         }
2648
2649         for (i = 0; i < num_sublists; i++) {
2650                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
2651                 multilist_sublist_remove(mls, markers[i]);
2652                 multilist_sublist_unlock(mls);
2653
2654                 kmem_cache_free(hdr_full_cache, markers[i]);
2655         }
2656         kmem_free(markers, sizeof (*markers) * num_sublists);
2657
2658         return (total_evicted);
2659 }
2660
2661 /*
2662  * Flush all "evictable" data of the given type from the arc state
2663  * specified. This will not evict any "active" buffers (i.e. referenced).
2664  *
2665  * When 'retry' is set to FALSE, the function will make a single pass
2666  * over the state and evict any buffers that it can. Since it doesn't
2667  * continually retry the eviction, it might end up leaving some buffers
2668  * in the ARC due to lock misses.
2669  *
2670  * When 'retry' is set to TRUE, the function will continually retry the
2671  * eviction until *all* evictable buffers have been removed from the
2672  * state. As a result, if concurrent insertions into the state are
2673  * allowed (e.g. if the ARC isn't shutting down), this function might
2674  * wind up in an infinite loop, continually trying to evict buffers.
2675  */
2676 static uint64_t
2677 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
2678     boolean_t retry)
2679 {
2680         uint64_t evicted = 0;
2681
2682         while (state->arcs_lsize[type] != 0) {
2683                 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
2684
2685                 if (!retry)
2686                         break;
2687         }
2688
2689         return (evicted);
2690 }
2691
2692 /*
2693  * Helper function for arc_prune() it is responsible for safely handling
2694  * the execution of a registered arc_prune_func_t.
2695  */
2696 static void
2697 arc_prune_task(void *ptr)
2698 {
2699         arc_prune_t *ap = (arc_prune_t *)ptr;
2700         arc_prune_func_t *func = ap->p_pfunc;
2701
2702         if (func != NULL)
2703                 func(ap->p_adjust, ap->p_private);
2704
2705         /* Callback unregistered concurrently with execution */
2706         if (refcount_remove(&ap->p_refcnt, func) == 0) {
2707                 ASSERT(!list_link_active(&ap->p_node));
2708                 refcount_destroy(&ap->p_refcnt);
2709                 kmem_free(ap, sizeof (*ap));
2710         }
2711 }
2712
2713 /*
2714  * Notify registered consumers they must drop holds on a portion of the ARC
2715  * buffered they reference.  This provides a mechanism to ensure the ARC can
2716  * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
2717  * is analogous to dnlc_reduce_cache() but more generic.
2718  *
2719  * This operation is performed asyncronously so it may be safely called
2720  * in the context of the arc_reclaim_thread().  A reference is taken here
2721  * for each registered arc_prune_t and the arc_prune_task() is responsible
2722  * for releasing it once the registered arc_prune_func_t has completed.
2723  */
2724 static void
2725 arc_prune_async(int64_t adjust)
2726 {
2727         arc_prune_t *ap;
2728
2729         mutex_enter(&arc_prune_mtx);
2730         for (ap = list_head(&arc_prune_list); ap != NULL;
2731             ap = list_next(&arc_prune_list, ap)) {
2732
2733                 if (refcount_count(&ap->p_refcnt) >= 2)
2734                         continue;
2735
2736                 refcount_add(&ap->p_refcnt, ap->p_pfunc);
2737                 ap->p_adjust = adjust;
2738                 taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
2739                 ARCSTAT_BUMP(arcstat_prune);
2740         }
2741         mutex_exit(&arc_prune_mtx);
2742 }
2743
2744 static void
2745 arc_prune(int64_t adjust)
2746 {
2747         arc_prune_async(adjust);
2748         taskq_wait_outstanding(arc_prune_taskq, 0);
2749 }
2750
2751 /*
2752  * Evict the specified number of bytes from the state specified,
2753  * restricting eviction to the spa and type given. This function
2754  * prevents us from trying to evict more from a state's list than
2755  * is "evictable", and to skip evicting altogether when passed a
2756  * negative value for "bytes". In contrast, arc_evict_state() will
2757  * evict everything it can, when passed a negative value for "bytes".
2758  */
2759 static uint64_t
2760 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
2761     arc_buf_contents_t type)
2762 {
2763         int64_t delta;
2764
2765         if (bytes > 0 && state->arcs_lsize[type] > 0) {
2766                 delta = MIN(state->arcs_lsize[type], bytes);
2767                 return (arc_evict_state(state, spa, delta, type));
2768         }
2769
2770         return (0);
2771 }
2772
2773 /*
2774  * The goal of this function is to evict enough meta data buffers from the
2775  * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
2776  * more complicated than it appears because it is common for data buffers
2777  * to have holds on meta data buffers.  In addition, dnode meta data buffers
2778  * will be held by the dnodes in the block preventing them from being freed.
2779  * This means we can't simply traverse the ARC and expect to always find
2780  * enough unheld meta data buffer to release.
2781  *
2782  * Therefore, this function has been updated to make alternating passes
2783  * over the ARC releasing data buffers and then newly unheld meta data
2784  * buffers.  This ensures forward progress is maintained and arc_meta_used
2785  * will decrease.  Normally this is sufficient, but if required the ARC
2786  * will call the registered prune callbacks causing dentry and inodes to
2787  * be dropped from the VFS cache.  This will make dnode meta data buffers
2788  * available for reclaim.
2789  */
2790 static uint64_t
2791 arc_adjust_meta_balanced(void)
2792 {
2793         int64_t adjustmnt, delta, prune = 0;
2794         uint64_t total_evicted = 0;
2795         arc_buf_contents_t type = ARC_BUFC_DATA;
2796         int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
2797
2798 restart:
2799         /*
2800          * This slightly differs than the way we evict from the mru in
2801          * arc_adjust because we don't have a "target" value (i.e. no
2802          * "meta" arc_p). As a result, I think we can completely
2803          * cannibalize the metadata in the MRU before we evict the
2804          * metadata from the MFU. I think we probably need to implement a
2805          * "metadata arc_p" value to do this properly.
2806          */
2807         adjustmnt = arc_meta_used - arc_meta_limit;
2808
2809         if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
2810                 delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
2811                 total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
2812                 adjustmnt -= delta;
2813         }
2814
2815         /*
2816          * We can't afford to recalculate adjustmnt here. If we do,
2817          * new metadata buffers can sneak into the MRU or ANON lists,
2818          * thus penalize the MFU metadata. Although the fudge factor is
2819          * small, it has been empirically shown to be significant for
2820          * certain workloads (e.g. creating many empty directories). As
2821          * such, we use the original calculation for adjustmnt, and
2822          * simply decrement the amount of data evicted from the MRU.
2823          */
2824
2825         if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
2826                 delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
2827                 total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
2828         }
2829
2830         adjustmnt = arc_meta_used - arc_meta_limit;
2831
2832         if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2833                 delta = MIN(adjustmnt,
2834                     arc_mru_ghost->arcs_lsize[type]);
2835                 total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
2836                 adjustmnt -= delta;
2837         }
2838
2839         if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
2840                 delta = MIN(adjustmnt,
2841                     arc_mfu_ghost->arcs_lsize[type]);
2842                 total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
2843         }
2844
2845         /*
2846          * If after attempting to make the requested adjustment to the ARC
2847          * the meta limit is still being exceeded then request that the
2848          * higher layers drop some cached objects which have holds on ARC
2849          * meta buffers.  Requests to the upper layers will be made with
2850          * increasingly large scan sizes until the ARC is below the limit.
2851          */
2852         if (arc_meta_used > arc_meta_limit) {
2853                 if (type == ARC_BUFC_DATA) {
2854                         type = ARC_BUFC_METADATA;
2855                 } else {
2856                         type = ARC_BUFC_DATA;
2857
2858                         if (zfs_arc_meta_prune) {
2859                                 prune += zfs_arc_meta_prune;
2860                                 arc_prune_async(prune);
2861                         }
2862                 }
2863
2864                 if (restarts > 0) {
2865                         restarts--;
2866                         goto restart;
2867                 }
2868         }
2869         return (total_evicted);
2870 }
2871
2872 /*
2873  * Evict metadata buffers from the cache, such that arc_meta_used is
2874  * capped by the arc_meta_limit tunable.
2875  */
2876 static uint64_t
2877 arc_adjust_meta_only(void)
2878 {
2879         uint64_t total_evicted = 0;
2880         int64_t target;
2881
2882         /*
2883          * If we're over the meta limit, we want to evict enough
2884          * metadata to get back under the meta limit. We don't want to
2885          * evict so much that we drop the MRU below arc_p, though. If
2886          * we're over the meta limit more than we're over arc_p, we
2887          * evict some from the MRU here, and some from the MFU below.
2888          */
2889         target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
2890             (int64_t)(refcount_count(&arc_anon->arcs_size) +
2891             refcount_count(&arc_mru->arcs_size) - arc_p));
2892
2893         total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2894
2895         /*
2896          * Similar to the above, we want to evict enough bytes to get us
2897          * below the meta limit, but not so much as to drop us below the
2898          * space alloted to the MFU (which is defined as arc_c - arc_p).
2899          */
2900         target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
2901             (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
2902
2903         total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2904
2905         return (total_evicted);
2906 }
2907
2908 static uint64_t
2909 arc_adjust_meta(void)
2910 {
2911         if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
2912                 return (arc_adjust_meta_only());
2913         else
2914                 return (arc_adjust_meta_balanced());
2915 }
2916
2917 /*
2918  * Return the type of the oldest buffer in the given arc state
2919  *
2920  * This function will select a random sublist of type ARC_BUFC_DATA and
2921  * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
2922  * is compared, and the type which contains the "older" buffer will be
2923  * returned.
2924  */
2925 static arc_buf_contents_t
2926 arc_adjust_type(arc_state_t *state)
2927 {
2928         multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
2929         multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
2930         int data_idx = multilist_get_random_index(data_ml);
2931         int meta_idx = multilist_get_random_index(meta_ml);
2932         multilist_sublist_t *data_mls;
2933         multilist_sublist_t *meta_mls;
2934         arc_buf_contents_t type;
2935         arc_buf_hdr_t *data_hdr;
2936         arc_buf_hdr_t *meta_hdr;
2937
2938         /*
2939          * We keep the sublist lock until we're finished, to prevent
2940          * the headers from being destroyed via arc_evict_state().
2941          */
2942         data_mls = multilist_sublist_lock(data_ml, data_idx);
2943         meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
2944
2945         /*
2946          * These two loops are to ensure we skip any markers that
2947          * might be at the tail of the lists due to arc_evict_state().
2948          */
2949
2950         for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
2951             data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
2952                 if (data_hdr->b_spa != 0)
2953                         break;
2954         }
2955
2956         for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
2957             meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
2958                 if (meta_hdr->b_spa != 0)
2959                         break;
2960         }
2961
2962         if (data_hdr == NULL && meta_hdr == NULL) {
2963                 type = ARC_BUFC_DATA;
2964         } else if (data_hdr == NULL) {
2965                 ASSERT3P(meta_hdr, !=, NULL);
2966                 type = ARC_BUFC_METADATA;
2967         } else if (meta_hdr == NULL) {
2968                 ASSERT3P(data_hdr, !=, NULL);
2969                 type = ARC_BUFC_DATA;
2970         } else {
2971                 ASSERT3P(data_hdr, !=, NULL);
2972                 ASSERT3P(meta_hdr, !=, NULL);
2973
2974                 /* The headers can't be on the sublist without an L1 header */
2975                 ASSERT(HDR_HAS_L1HDR(data_hdr));
2976                 ASSERT(HDR_HAS_L1HDR(meta_hdr));
2977
2978                 if (data_hdr->b_l1hdr.b_arc_access <
2979                     meta_hdr->b_l1hdr.b_arc_access) {
2980                         type = ARC_BUFC_DATA;
2981                 } else {
2982                         type = ARC_BUFC_METADATA;
2983                 }
2984         }
2985
2986         multilist_sublist_unlock(meta_mls);
2987         multilist_sublist_unlock(data_mls);
2988
2989         return (type);
2990 }
2991
2992 /*
2993  * Evict buffers from the cache, such that arc_size is capped by arc_c.
2994  */
2995 static uint64_t
2996 arc_adjust(void)
2997 {
2998         uint64_t total_evicted = 0;
2999         uint64_t bytes;
3000         int64_t target;
3001
3002         /*
3003          * If we're over arc_meta_limit, we want to correct that before
3004          * potentially evicting data buffers below.
3005          */
3006         total_evicted += arc_adjust_meta();
3007
3008         /*
3009          * Adjust MRU size
3010          *
3011          * If we're over the target cache size, we want to evict enough
3012          * from the list to get back to our target size. We don't want
3013          * to evict too much from the MRU, such that it drops below
3014          * arc_p. So, if we're over our target cache size more than
3015          * the MRU is over arc_p, we'll evict enough to get back to
3016          * arc_p here, and then evict more from the MFU below.
3017          */
3018         target = MIN((int64_t)(arc_size - arc_c),
3019             (int64_t)(refcount_count(&arc_anon->arcs_size) +
3020             refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
3021
3022         /*
3023          * If we're below arc_meta_min, always prefer to evict data.
3024          * Otherwise, try to satisfy the requested number of bytes to
3025          * evict from the type which contains older buffers; in an
3026          * effort to keep newer buffers in the cache regardless of their
3027          * type. If we cannot satisfy the number of bytes from this
3028          * type, spill over into the next type.
3029          */
3030         if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
3031             arc_meta_used > arc_meta_min) {
3032                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3033                 total_evicted += bytes;
3034
3035                 /*
3036                  * If we couldn't evict our target number of bytes from
3037                  * metadata, we try to get the rest from data.
3038                  */
3039                 target -= bytes;
3040
3041                 total_evicted +=
3042                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3043         } else {
3044                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3045                 total_evicted += bytes;
3046
3047                 /*
3048                  * If we couldn't evict our target number of bytes from
3049                  * data, we try to get the rest from metadata.
3050                  */
3051                 target -= bytes;
3052
3053                 total_evicted +=
3054                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3055         }
3056
3057         /*
3058          * Adjust MFU size
3059          *
3060          * Now that we've tried to evict enough from the MRU to get its
3061          * size back to arc_p, if we're still above the target cache
3062          * size, we evict the rest from the MFU.
3063          */
3064         target = arc_size - arc_c;
3065
3066         if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
3067             arc_meta_used > arc_meta_min) {
3068                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3069                 total_evicted += bytes;
3070
3071                 /*
3072                  * If we couldn't evict our target number of bytes from
3073                  * metadata, we try to get the rest from data.
3074                  */
3075                 target -= bytes;
3076
3077                 total_evicted +=
3078                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3079         } else {
3080                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3081                 total_evicted += bytes;
3082
3083                 /*
3084                  * If we couldn't evict our target number of bytes from
3085                  * data, we try to get the rest from data.
3086                  */
3087                 target -= bytes;
3088
3089                 total_evicted +=
3090                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3091         }
3092
3093         /*
3094          * Adjust ghost lists
3095          *
3096          * In addition to the above, the ARC also defines target values
3097          * for the ghost lists. The sum of the mru list and mru ghost
3098          * list should never exceed the target size of the cache, and
3099          * the sum of the mru list, mfu list, mru ghost list, and mfu
3100          * ghost list should never exceed twice the target size of the
3101          * cache. The following logic enforces these limits on the ghost
3102          * caches, and evicts from them as needed.
3103          */
3104         target = refcount_count(&arc_mru->arcs_size) +
3105             refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
3106
3107         bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3108         total_evicted += bytes;
3109
3110         target -= bytes;
3111
3112         total_evicted +=
3113             arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3114
3115         /*
3116          * We assume the sum of the mru list and mfu list is less than
3117          * or equal to arc_c (we enforced this above), which means we
3118          * can use the simpler of the two equations below:
3119          *
3120          *      mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3121          *                  mru ghost + mfu ghost <= arc_c
3122          */
3123         target = refcount_count(&arc_mru_ghost->arcs_size) +
3124             refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
3125
3126         bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3127         total_evicted += bytes;
3128
3129         target -= bytes;
3130
3131         total_evicted +=
3132             arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3133
3134         return (total_evicted);
3135 }
3136
3137 static void
3138 arc_do_user_evicts(void)
3139 {
3140         mutex_enter(&arc_user_evicts_lock);
3141         while (arc_eviction_list != NULL) {
3142                 arc_buf_t *buf = arc_eviction_list;
3143                 arc_eviction_list = buf->b_next;
3144                 mutex_enter(&buf->b_evict_lock);
3145                 buf->b_hdr = NULL;
3146                 mutex_exit(&buf->b_evict_lock);
3147                 mutex_exit(&arc_user_evicts_lock);
3148
3149                 if (buf->b_efunc != NULL)
3150                         VERIFY0(buf->b_efunc(buf->b_private));
3151
3152                 buf->b_efunc = NULL;
3153                 buf->b_private = NULL;
3154                 kmem_cache_free(buf_cache, buf);
3155                 mutex_enter(&arc_user_evicts_lock);
3156         }
3157         mutex_exit(&arc_user_evicts_lock);
3158 }
3159
3160 void
3161 arc_flush(spa_t *spa, boolean_t retry)
3162 {
3163         uint64_t guid = 0;
3164
3165         /*
3166          * If retry is TRUE, a spa must not be specified since we have
3167          * no good way to determine if all of a spa's buffers have been
3168          * evicted from an arc state.
3169          */
3170         ASSERT(!retry || spa == 0);
3171
3172         if (spa != NULL)
3173                 guid = spa_load_guid(spa);
3174
3175         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
3176         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
3177
3178         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
3179         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
3180
3181         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
3182         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
3183
3184         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
3185         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
3186
3187         arc_do_user_evicts();
3188         ASSERT(spa || arc_eviction_list == NULL);
3189 }
3190
3191 void
3192 arc_shrink(int64_t to_free)
3193 {
3194         if (arc_c > arc_c_min) {
3195
3196                 if (arc_c > arc_c_min + to_free)
3197                         atomic_add_64(&arc_c, -to_free);
3198                 else
3199                         arc_c = arc_c_min;
3200
3201                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
3202                 if (arc_c > arc_size)
3203                         arc_c = MAX(arc_size, arc_c_min);
3204                 if (arc_p > arc_c)
3205                         arc_p = (arc_c >> 1);
3206                 ASSERT(arc_c >= arc_c_min);
3207                 ASSERT((int64_t)arc_p >= 0);
3208         }
3209
3210         if (arc_size > arc_c)
3211                 (void) arc_adjust();
3212 }
3213
/*
 * Identifies which check in arc_available_memory() produced the lowest
 * (most constraining) estimate of available memory.
 */
typedef enum free_memory_reason_t {
        FMR_UNKNOWN,            /* no check recorded a reason */
        FMR_NEEDFREE,           /* needfree pages are outstanding */
        FMR_LOTSFREE,           /* freemem vs. lotsfree/needfree/desfree */
        FMR_SWAPFS_MINFREE,     /* availrmem vs. the swapfs reserves */
        FMR_PAGES_PP_MAXIMUM,   /* availrmem vs. pages_pp_maximum */
        FMR_HEAP_ARENA,         /* free space in heap_arena (i386 only) */
        FMR_ZIO_ARENA,          /* free space in zio_arena */
} free_memory_reason_t;

/*
 * Value returned by, and limiting reason recorded by, the most recent call
 * to arc_available_memory().
 */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;
3226
#ifdef _KERNEL
#ifdef __linux__
/*
 * Expiration time (an lbolt value) for arc_no_grow set by direct memory
 * reclaim.  It is armed by the shrinker callback when the caller is not
 * kswapd; until it has passed, arc_available_memory() reports a negative
 * amount of available memory.
 */
static clock_t arc_grow_time = 0;
#else
/*
 * Additional reserve of pages for pp_reserve.  Subtracted, together with
 * pages_pp_maximum, from availrmem in arc_available_memory().
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs.  Subtracted, together with the
 * swapfs reserves and desfree, from availrmem in arc_available_memory().
 */
int64_t arc_swapfs_reserve = 64;
#endif
#endif /* _KERNEL */
3245
3246 /*
3247  * Return the amount of memory that can be consumed before reclaim will be
3248  * needed.  Positive if there is sufficient free memory, negative indicates
3249  * the amount of memory that needs to be freed up.
3250  */
3251 static int64_t
3252 arc_available_memory(void)
3253 {
3254         int64_t lowest = INT64_MAX;
3255         free_memory_reason_t r = FMR_UNKNOWN;
3256
3257 #ifdef _KERNEL
3258 #ifdef __linux__
3259         /*
3260          * Under Linux we are not allowed to directly interrogate the global
3261          * memory state.  Instead rely on observing that direct reclaim has
3262          * recently occurred therefore the system must be low on memory.  The
3263          * exact values returned are not critical but should be small.
3264          */
3265         if (ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
3266                 lowest = PAGE_SIZE;
3267         else
3268                 lowest = -PAGE_SIZE;
3269 #else
3270         int64_t n;
3271
3272         /*
3273          * Platforms like illumos have greater visibility in to the memory
3274          * subsystem and can return a more detailed analysis of memory.
3275          */
3276         if (needfree > 0) {
3277                 n = PAGESIZE * (-needfree);
3278                 if (n < lowest) {
3279                         lowest = n;
3280                         r = FMR_NEEDFREE;
3281                 }
3282         }
3283
3284         /*
3285          * check that we're out of range of the pageout scanner.  It starts to
3286          * schedule paging if freemem is less than lotsfree and needfree.
3287          * lotsfree is the high-water mark for pageout, and needfree is the
3288          * number of needed free pages.  We add extra pages here to make sure
3289          * the scanner doesn't start up while we're freeing memory.
3290          */
3291         n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3292         if (n < lowest) {
3293                 lowest = n;
3294                 r = FMR_LOTSFREE;
3295         }
3296
3297         /*
3298          * check to make sure that swapfs has enough space so that anon
3299          * reservations can still succeed. anon_resvmem() checks that the
3300          * availrmem is greater than swapfs_minfree, and the number of reserved
3301          * swap pages.  We also add a bit of extra here just to prevent
3302          * circumstances from getting really dire.
3303          */
3304         n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3305             desfree - arc_swapfs_reserve);
3306         if (n < lowest) {
3307                 lowest = n;
3308                 r = FMR_SWAPFS_MINFREE;
3309         }
3310
3311
3312         /*
3313          * Check that we have enough availrmem that memory locking (e.g., via
3314          * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
3315          * stores the number of pages that cannot be locked; when availrmem
3316          * drops below pages_pp_maximum, page locking mechanisms such as
3317          * page_pp_lock() will fail.)
3318          */
3319         n = PAGESIZE * (availrmem - pages_pp_maximum -
3320             arc_pages_pp_reserve);
3321         if (n < lowest) {
3322                 lowest = n;
3323                 r = FMR_PAGES_PP_MAXIMUM;
3324         }
3325
3326 #if defined(__i386)
3327         /*
3328          * If we're on an i386 platform, it's possible that we'll exhaust the
3329          * kernel heap space before we ever run out of available physical
3330          * memory.  Most checks of the size of the heap_area compare against
3331          * tune.t_minarmem, which is the minimum available real memory that we
3332          * can have in the system.  However, this is generally fixed at 25 pages
3333          * which is so low that it's useless.  In this comparison, we seek to
3334          * calculate the total heap-size, and reclaim if more than 3/4ths of the
3335          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
3336          * free)
3337          */
3338         n = vmem_size(heap_arena, VMEM_FREE) -
3339             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3340         if (n < lowest) {
3341                 lowest = n;
3342                 r = FMR_HEAP_ARENA;
3343         }
3344 #endif
3345
3346         /*
3347          * If zio data pages are being allocated out of a separate heap segment,
3348          * then enforce that the size of available vmem for this arena remains
3349          * above about 1/16th free.
3350          *
3351          * Note: The 1/16th arena free requirement was put in place
3352          * to aggressively evict memory from the arc in order to avoid
3353          * memory fragmentation issues.
3354          */
3355         if (zio_arena != NULL) {
3356                 n = vmem_size(zio_arena, VMEM_FREE) -
3357                     (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3358                 if (n < lowest) {
3359                         lowest = n;
3360                         r = FMR_ZIO_ARENA;
3361                 }
3362         }
3363 #endif /* __linux__ */
3364 #else
3365         /* Every 100 calls, free a small amount */
3366         if (spa_get_random(100) == 0)
3367                 lowest = -1024;
3368 #endif
3369
3370         last_free_memory = lowest;
3371         last_free_reason = r;
3372
3373         return (lowest);
3374 }
3375
3376 /*
3377  * Determine if the system is under memory pressure and is asking
3378  * to reclaim memory. A return value of TRUE indicates that the system
3379  * is under memory pressure and that the arc should adjust accordingly.
3380  */
3381 static boolean_t
3382 arc_reclaim_needed(void)
3383 {
3384         return (arc_available_memory() < 0);
3385 }
3386
3387 static void
3388 arc_kmem_reap_now(void)
3389 {
3390         size_t                  i;
3391         kmem_cache_t            *prev_cache = NULL;
3392         kmem_cache_t            *prev_data_cache = NULL;
3393         extern kmem_cache_t     *zio_buf_cache[];
3394         extern kmem_cache_t     *zio_data_buf_cache[];
3395         extern kmem_cache_t     *range_seg_cache;
3396
3397         if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
3398                 /*
3399                  * We are exceeding our meta-data cache limit.
3400                  * Prune some entries to release holds on meta-data.
3401                  */
3402                 arc_prune(zfs_arc_meta_prune);
3403         }
3404
3405         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3406                 if (zio_buf_cache[i] != prev_cache) {
3407                         prev_cache = zio_buf_cache[i];
3408                         kmem_cache_reap_now(zio_buf_cache[i]);
3409                 }
3410                 if (zio_data_buf_cache[i] != prev_data_cache) {
3411                         prev_data_cache = zio_data_buf_cache[i];
3412                         kmem_cache_reap_now(zio_data_buf_cache[i]);
3413                 }
3414         }
3415         kmem_cache_reap_now(buf_cache);
3416         kmem_cache_reap_now(hdr_full_cache);
3417         kmem_cache_reap_now(hdr_l2only_cache);
3418         kmem_cache_reap_now(range_seg_cache);
3419
3420         if (zio_arena != NULL) {
3421                 /*
3422                  * Ask the vmem arena to reclaim unused memory from its
3423                  * quantum caches.
3424                  */
3425                 vmem_qcache_reap(zio_arena);
3426         }
3427 }
3428
3429 /*
3430  * Threads can block in arc_get_data_buf() waiting for this thread to evict
3431  * enough data and signal them to proceed. When this happens, the threads in
3432  * arc_get_data_buf() are sleeping while holding the hash lock for their
3433  * particular arc header. Thus, we must be careful to never sleep on a
3434  * hash lock in this thread. This is to prevent the following deadlock:
3435  *
3436  *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
3437  *    waiting for the reclaim thread to signal it.
3438  *
3439  *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
3440  *    fails, and goes to sleep forever.
3441  *
3442  * This possible deadlock is avoided by always acquiring a hash lock
3443  * using mutex_tryenter() from arc_reclaim_thread().
3444  */
3445 static void
3446 arc_reclaim_thread(void)
3447 {
3448         fstrans_cookie_t        cookie = spl_fstrans_mark();
3449         clock_t                 growtime = 0;
3450         callb_cpr_t             cpr;
3451
3452         CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
3453
3454         mutex_enter(&arc_reclaim_lock);
3455         while (!arc_reclaim_thread_exit) {
3456                 int64_t to_free;
3457                 int64_t free_memory = arc_available_memory();
3458                 uint64_t evicted = 0;
3459
3460                 arc_tuning_update();
3461
3462                 mutex_exit(&arc_reclaim_lock);
3463
3464                 if (free_memory < 0) {
3465
3466                         arc_no_grow = B_TRUE;
3467                         arc_warm = B_TRUE;
3468
3469                         /*
3470                          * Wait at least zfs_grow_retry (default 5) seconds
3471                          * before considering growing.
3472                          */
3473                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
3474
3475                         arc_kmem_reap_now();
3476
3477                         /*
3478                          * If we are still low on memory, shrink the ARC
3479                          * so that we have arc_shrink_min free space.
3480                          */
3481                         free_memory = arc_available_memory();
3482
3483                         to_free = (arc_c >> arc_shrink_shift) - free_memory;
3484                         if (to_free > 0) {
3485 #ifdef _KERNEL
3486                                 to_free = MAX(to_free, ptob(needfree));
3487 #endif
3488                                 arc_shrink(to_free);
3489                         }
3490                 } else if (free_memory < arc_c >> arc_no_grow_shift) {
3491                         arc_no_grow = B_TRUE;
3492                 } else if (ddi_get_lbolt() >= growtime) {
3493                         arc_no_grow = B_FALSE;
3494                 }
3495
3496                 evicted = arc_adjust();
3497
3498                 mutex_enter(&arc_reclaim_lock);
3499
3500                 /*
3501                  * If evicted is zero, we couldn't evict anything via
3502                  * arc_adjust(). This could be due to hash lock
3503                  * collisions, but more likely due to the majority of
3504                  * arc buffers being unevictable. Therefore, even if
3505                  * arc_size is above arc_c, another pass is unlikely to
3506                  * be helpful and could potentially cause us to enter an
3507                  * infinite loop.
3508                  */
3509                 if (arc_size <= arc_c || evicted == 0) {
3510                         /*
3511                          * We're either no longer overflowing, or we
3512                          * can't evict anything more, so we should wake
3513                          * up any threads before we go to sleep.
3514                          */
3515                         cv_broadcast(&arc_reclaim_waiters_cv);
3516
3517                         /*
3518                          * Block until signaled, or after one second (we
3519                          * might need to perform arc_kmem_reap_now()
3520                          * even if we aren't being signalled)
3521                          */
3522                         CALLB_CPR_SAFE_BEGIN(&cpr);
3523                         (void) cv_timedwait_sig(&arc_reclaim_thread_cv,
3524                             &arc_reclaim_lock, ddi_get_lbolt() + hz);
3525                         CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
3526                 }
3527         }
3528
3529         arc_reclaim_thread_exit = FALSE;
3530         cv_broadcast(&arc_reclaim_thread_cv);
3531         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_lock */
3532         spl_fstrans_unmark(cookie);
3533         thread_exit();
3534 }
3535
3536 static void
3537 arc_user_evicts_thread(void)
3538 {
3539         fstrans_cookie_t        cookie = spl_fstrans_mark();
3540         callb_cpr_t cpr;
3541
3542         CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
3543
3544         mutex_enter(&arc_user_evicts_lock);
3545         while (!arc_user_evicts_thread_exit) {
3546                 mutex_exit(&arc_user_evicts_lock);
3547
3548                 arc_do_user_evicts();
3549
3550                 /*
3551                  * This is necessary in order for the mdb ::arc dcmd to
3552                  * show up to date information. Since the ::arc command
3553                  * does not call the kstat's update function, without
3554                  * this call, the command may show stale stats for the
3555                  * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3556                  * with this change, the data might be up to 1 second
3557                  * out of date; but that should suffice. The arc_state_t
3558                  * structures can be queried directly if more accurate
3559                  * information is needed.
3560                  */
3561                 if (arc_ksp != NULL)
3562                         arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3563
3564                 mutex_enter(&arc_user_evicts_lock);
3565
3566                 /*
3567                  * Block until signaled, or after one second (we need to
3568                  * call the arc's kstat update function regularly).
3569                  */
3570                 CALLB_CPR_SAFE_BEGIN(&cpr);
3571                 (void) cv_timedwait_sig(&arc_user_evicts_cv,
3572                     &arc_user_evicts_lock, ddi_get_lbolt() + hz);
3573                 CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
3574         }
3575
3576         arc_user_evicts_thread_exit = FALSE;
3577         cv_broadcast(&arc_user_evicts_cv);
3578         CALLB_CPR_EXIT(&cpr);           /* drops arc_user_evicts_lock */
3579         spl_fstrans_unmark(cookie);
3580         thread_exit();
3581 }
3582
3583 #ifdef _KERNEL
3584 /*
3585  * Determine the amount of memory eligible for eviction contained in the
3586  * ARC. All clean data reported by the ghost lists can always be safely
3587  * evicted. Due to arc_c_min, the same does not hold for all clean data
3588  * contained by the regular mru and mfu lists.
3589  *
3590  * In the case of the regular mru and mfu lists, we need to report as
3591  * much clean data as possible, such that evicting that same reported
3592  * data will not bring arc_size below arc_c_min. Thus, in certain
3593  * circumstances, the total amount of clean data in the mru and mfu
3594  * lists might not actually be evictable.
3595  *
3596  * The following two distinct cases are accounted for:
3597  *
3598  * 1. The sum of the amount of dirty data contained by both the mru and
3599  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
3600  *    is greater than or equal to arc_c_min.
3601  *    (i.e. amount of dirty data >= arc_c_min)
3602  *
3603  *    This is the easy case; all clean data contained by the mru and mfu
3604  *    lists is evictable. Evicting all clean data can only drop arc_size
3605  *    to the amount of dirty data, which is greater than arc_c_min.
3606  *
3607  * 2. The sum of the amount of dirty data contained by both the mru and
3608  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
3609  *    is less than arc_c_min.
3610  *    (i.e. arc_c_min > amount of dirty data)
3611  *
3612  *    2.1. arc_size is greater than or equal arc_c_min.
3613  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
3614  *
3615  *         In this case, not all clean data from the regular mru and mfu
3616  *         lists is actually evictable; we must leave enough clean data
3617  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
3618  *         evictable data from the two lists combined, is exactly the
3619  *         difference between arc_size and arc_c_min.
3620  *
3621  *    2.2. arc_size is less than arc_c_min
3622  *         (i.e. arc_c_min > arc_size > amount of dirty data)
3623  *
3624  *         In this case, none of the data contained in the mru and mfu
3625  *         lists is evictable, even if it's clean. Since arc_size is
3626  *         already below arc_c_min, evicting any more would only
3627  *         increase this negative difference.
3628  */
3629 static uint64_t
3630 arc_evictable_memory(void) {
3631         uint64_t arc_clean =
3632             arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3633             arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3634             arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3635             arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3636         uint64_t ghost_clean =
3637             arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
3638             arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
3639             arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
3640             arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
3641         uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
3642
3643         if (arc_dirty >= arc_c_min)
3644                 return (ghost_clean + arc_clean);
3645
3646         return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
3647 }
3648
3649 /*
3650  * If sc->nr_to_scan is zero, the caller is requesting a query of the
3651  * number of objects which can potentially be freed.  If it is nonzero,
3652  * the request is to free that many objects.
3653  *
3654  * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
3655  * in struct shrinker and also require the shrinker to return the number
3656  * of objects freed.
3657  *
3658  * Older kernels require the shrinker to return the number of freeable
3659  * objects following the freeing of nr_to_free.
3660  */
3661 static spl_shrinker_t
3662 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
3663 {
3664         int64_t pages;
3665
3666         /* The arc is considered warm once reclaim has occurred */
3667         if (unlikely(arc_warm == B_FALSE))
3668                 arc_warm = B_TRUE;
3669
3670         /* Return the potential number of reclaimable pages */
3671         pages = btop((int64_t)arc_evictable_memory());
3672         if (sc->nr_to_scan == 0)
3673                 return (pages);
3674
3675         /* Not allowed to perform filesystem reclaim */
3676         if (!(sc->gfp_mask & __GFP_FS))
3677                 return (SHRINK_STOP);
3678
3679         /* Reclaim in progress */
3680         if (mutex_tryenter(&arc_reclaim_lock) == 0)
3681                 return (SHRINK_STOP);
3682
3683         mutex_exit(&arc_reclaim_lock);
3684
3685         /*
3686          * Evict the requested number of pages by shrinking arc_c the
3687          * requested amount.  If there is nothing left to evict just
3688          * reap whatever we can from the various arc slabs.
3689          */
3690         if (pages > 0) {
3691                 arc_shrink(ptob(sc->nr_to_scan));
3692                 arc_kmem_reap_now();
3693 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
3694                 pages = MAX(pages - btop(arc_evictable_memory()), 0);
3695 #else
3696                 pages = btop(arc_evictable_memory());
3697 #endif
3698         } else {
3699                 arc_kmem_reap_now();
3700                 pages = SHRINK_STOP;
3701         }
3702
3703         /*
3704          * We've reaped what we can, wake up threads.
3705          */
3706         cv_broadcast(&arc_reclaim_waiters_cv);
3707
3708         /*
3709          * When direct reclaim is observed it usually indicates a rapid
3710          * increase in memory pressure.  This occurs because the kswapd
3711          * threads were unable to asynchronously keep enough free memory
3712          * available.  In this case set arc_no_grow to briefly pause arc
3713          * growth to avoid compounding the memory pressure.
3714          */
3715         if (current_is_kswapd()) {
3716                 ARCSTAT_BUMP(arcstat_memory_indirect_count);
3717         } else {
3718                 arc_no_grow = B_TRUE;
3719                 arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
3720                 ARCSTAT_BUMP(arcstat_memory_direct_count);
3721         }
3722
3723         return (pages);
3724 }
3725 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
3726
3727 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
3728 #endif /* _KERNEL */
3729
3730 /*
3731  * Adapt arc info given the number of bytes we are trying to add and
3732  * the state that we are comming from.  This function is only called
3733  * when we are adding new content to the cache.
3734  */
3735 static void
3736 arc_adapt(int bytes, arc_state_t *state)
3737 {
3738         int mult;
3739         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3740         int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
3741         int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
3742
3743         if (state == arc_l2c_only)
3744                 return;
3745
3746         ASSERT(bytes > 0);
3747         /*
3748          * Adapt the target size of the MRU list:
3749          *      - if we just hit in the MRU ghost list, then increase
3750          *        the target size of the MRU list.
3751          *      - if we just hit in the MFU ghost list, then increase
3752          *        the target size of the MFU list by decreasing the
3753          *        target size of the MRU list.
3754          */
3755         if (state == arc_mru_ghost) {
3756                 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
3757                 if (!zfs_arc_p_dampener_disable)
3758                         mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3759
3760                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3761         } else if (state == arc_mfu_ghost) {
3762                 uint64_t delta;
3763
3764                 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
3765                 if (!zfs_arc_p_dampener_disable)
3766                         mult = MIN(mult, 10);
3767
3768                 delta = MIN(bytes * mult, arc_p);
3769                 arc_p = MAX(arc_p_min, arc_p - delta);
3770         }
3771         ASSERT((int64_t)arc_p >= 0);
3772
3773         if (arc_reclaim_needed()) {
3774                 cv_signal(&arc_reclaim_thread_cv);
3775                 return;
3776         }
3777
3778         if (arc_no_grow)
3779                 return;
3780
3781         if (arc_c >= arc_c_max)
3782                 return;
3783
3784         /*
3785          * If we're within (2 * maxblocksize) bytes of the target
3786          * cache size, increment the target cache size
3787          */
3788         VERIFY3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
3789         if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3790                 atomic_add_64(&arc_c, (int64_t)bytes);
3791                 if (arc_c > arc_c_max)
3792                         arc_c = arc_c_max;
3793                 else if (state == arc_anon)
3794                         atomic_add_64(&arc_p, (int64_t)bytes);
3795                 if (arc_p > arc_c)
3796                         arc_p = arc_c;
3797         }
3798         ASSERT((int64_t)arc_p >= 0);
3799 }
3800
3801 /*
3802  * Check if arc_size has grown past our upper threshold, determined by
3803  * zfs_arc_overflow_shift.
3804  */
3805 static boolean_t
3806 arc_is_overflowing(void)
3807 {
3808         /* Always allow at least one block of overflow */
3809         uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
3810             arc_c >> zfs_arc_overflow_shift);
3811
3812         return (arc_size >= arc_c + overflow);
3813 }
3814
3815 /*
3816  * The buffer, supplied as the first argument, needs a data block. If we
3817  * are hitting the hard limit for the cache size, we must sleep, waiting
3818  * for the eviction thread to catch up. If we're past the target size
3819  * but below the hard limit, we'll only signal the reclaim thread and
3820  * continue on.
3821  */
3822 static void
3823 arc_get_data_buf(arc_buf_t *buf)
3824 {
3825         arc_state_t             *state = buf->b_hdr->b_l1hdr.b_state;
3826         uint64_t                size = buf->b_hdr->b_size;
3827         arc_buf_contents_t      type = arc_buf_type(buf->b_hdr);
3828
3829         arc_adapt(size, state);
3830
3831         /*
3832          * If arc_size is currently overflowing, and has grown past our
3833          * upper limit, we must be adding data faster than the evict
3834          * thread can evict. Thus, to ensure we don't compound the
3835          * problem by adding more data and forcing arc_size to grow even
3836          * further past it's target size, we halt and wait for the
3837          * eviction thread to catch up.
3838          *
3839          * It's also possible that the reclaim thread is unable to evict
3840          * enough buffers to get arc_size below the overflow limit (e.g.
3841          * due to buffers being un-evictable, or hash lock collisions).
3842          * In this case, we want to proceed regardless if we're
3843          * overflowing; thus we don't use a while loop here.
3844          */
3845         if (arc_is_overflowing()) {
3846                 mutex_enter(&arc_reclaim_lock);
3847
3848                 /*
3849                  * Now that we've acquired the lock, we may no longer be
3850                  * over the overflow limit, lets check.
3851                  *
3852                  * We're ignoring the case of spurious wake ups. If that
3853                  * were to happen, it'd let this thread consume an ARC
3854                  * buffer before it should have (i.e. before we're under
3855                  * the overflow limit and were signalled by the reclaim
3856                  * thread). As long as that is a rare occurrence, it
3857                  * shouldn't cause any harm.
3858                  */
3859                 if (arc_is_overflowing()) {
3860                         cv_signal(&arc_reclaim_thread_cv);
3861                         cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
3862                 }
3863
3864                 mutex_exit(&arc_reclaim_lock);
3865         }
3866
3867         if (type == ARC_BUFC_METADATA) {
3868                 buf->b_data = zio_buf_alloc(size);
3869                 arc_space_consume(size, ARC_SPACE_META);
3870         } else {
3871                 ASSERT(type == ARC_BUFC_DATA);
3872                 buf->b_data = zio_data_buf_alloc(size);
3873                 arc_space_consume(size, ARC_SPACE_DATA);
3874         }
3875
3876         /*
3877          * Update the state size.  Note that ghost states have a
3878          * "ghost size" and so don't need to be updated.
3879          */
3880         if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3881                 arc_buf_hdr_t *hdr = buf->b_hdr;
3882                 arc_state_t *state = hdr->b_l1hdr.b_state;
3883
3884                 (void) refcount_add_many(&state->arcs_size, size, buf);
3885
3886                 /*
3887                  * If this is reached via arc_read, the link is
3888                  * protected by the hash lock. If reached via
3889                  * arc_buf_alloc, the header should not be accessed by
3890                  * any other thread. And, if reached via arc_read_done,
3891                  * the hash lock will protect it if it's found in the
3892                  * hash table; otherwise no other thread should be
3893                  * trying to [add|remove]_reference it.
3894                  */
3895                 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
3896                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3897                         atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3898                             size);
3899                 }
3900                 /*
3901                  * If we are growing the cache, and we are adding anonymous
3902                  * data, and we have outgrown arc_p, update arc_p
3903                  */
3904                 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
3905                     (refcount_count(&arc_anon->arcs_size) +
3906                     refcount_count(&arc_mru->arcs_size) > arc_p))
3907                         arc_p = MIN(arc_c, arc_p + size);
3908         }
3909 }
3910
3911 /*
3912  * This routine is called whenever a buffer is accessed.
3913  * NOTE: the hash lock is dropped in this function.
3914  */
3915 static void
3916 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3917 {
3918         clock_t now;
3919
3920         ASSERT(MUTEX_HELD(hash_lock));
3921         ASSERT(HDR_HAS_L1HDR(hdr));
3922
3923         if (hdr->b_l1hdr.b_state == arc_anon) {
3924                 /*
3925                  * This buffer is not in the cache, and does not
3926                  * appear in our "ghost" list.  Add the new buffer
3927                  * to the MRU state.
3928                  */
3929
3930                 ASSERT0(hdr->b_l1hdr.b_arc_access);
3931                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3932                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3933                 arc_change_state(arc_mru, hdr, hash_lock);
3934
3935         } else if (hdr->b_l1hdr.b_state == arc_mru) {
3936                 now = ddi_get_lbolt();
3937
3938                 /*
3939                  * If this buffer is here because of a prefetch, then either:
3940                  * - clear the flag if this is a "referencing" read
3941                  *   (any subsequent access will bump this into the MFU state).
3942                  * or
3943                  * - move the buffer to the head of the list if this is
3944                  *   another prefetch (to make it less likely to be evicted).
3945                  */
3946                 if (HDR_PREFETCH(hdr)) {
3947                         if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3948                                 /* link protected by hash lock */
3949                                 ASSERT(multilist_link_active(
3950                                     &hdr->b_l1hdr.b_arc_node));
3951                         } else {
3952                                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3953                                 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
3954                                 ARCSTAT_BUMP(arcstat_mru_hits);
3955                         }
3956                         hdr->b_l1hdr.b_arc_access = now;
3957                         return;
3958                 }
3959
3960                 /*
3961                  * This buffer has been "accessed" only once so far,
3962                  * but it is still in the cache. Move it to the MFU
3963                  * state.
3964                  */
3965                 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
3966                     ARC_MINTIME)) {
3967                         /*
3968                          * More than 125ms have passed since we
3969                          * instantiated this buffer.  Move it to the
3970                          * most frequently used state.
3971                          */
3972                         hdr->b_l1hdr.b_arc_access = now;
3973                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3974                         arc_change_state(arc_mfu, hdr, hash_lock);
3975                 }
3976                 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
3977                 ARCSTAT_BUMP(arcstat_mru_hits);
3978         } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
3979                 arc_state_t     *new_state;
3980                 /*
3981                  * This buffer has been "accessed" recently, but
3982                  * was evicted from the cache.  Move it to the
3983                  * MFU state.
3984                  */
3985
3986                 if (HDR_PREFETCH(hdr)) {
3987                         new_state = arc_mru;
3988                         if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
3989                                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3990                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3991                 } else {
3992                         new_state = arc_mfu;
3993                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3994                 }
3995
3996                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3997                 arc_change_state(new_state, hdr, hash_lock);
3998
3999                 atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
4000                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
4001         } else if (hdr->b_l1hdr.b_state == arc_mfu) {
4002                 /*
4003                  * This buffer has been accessed more than once and is
4004                  * still in the cache.  Keep it in the MFU state.
4005                  *
4006                  * NOTE: an add_reference() that occurred when we did
4007                  * the arc_read() will have kicked this off the list.
4008                  * If it was a prefetch, we will explicitly move it to
4009                  * the head of the list now.
4010                  */
4011                 if ((HDR_PREFETCH(hdr)) != 0) {
4012                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4013                         /* link protected by hash_lock */
4014                         ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4015                 }
4016                 atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
4017                 ARCSTAT_BUMP(arcstat_mfu_hits);
4018                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4019         } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
4020                 arc_state_t     *new_state = arc_mfu;
4021                 /*
4022                  * This buffer has been accessed more than once but has
4023                  * been evicted from the cache.  Move it back to the
4024                  * MFU state.
4025                  */
4026
4027                 if (HDR_PREFETCH(hdr)) {
4028                         /*
4029                          * This is a prefetch access...
4030                          * move this block back to the MRU state.
4031                          */
4032                         ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
4033                         new_state = arc_mru;
4034                 }
4035
4036                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4037                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4038                 arc_change_state(new_state, hdr, hash_lock);
4039
4040                 atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
4041                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
4042         } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
4043                 /*
4044                  * This buffer is on the 2nd Level ARC.
4045                  */
4046
4047                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4048                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4049                 arc_change_state(arc_mfu, hdr, hash_lock);
4050         } else {
4051                 cmn_err(CE_PANIC, "invalid arc state 0x%p",
4052                     hdr->b_l1hdr.b_state);
4053         }
4054 }
4055
4056 /* a generic arc_done_func_t which you can use */
4057 /* ARGSUSED */
4058 void
4059 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
4060 {
4061         if (zio == NULL || zio->io_error == 0)
4062                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
4063         VERIFY(arc_buf_remove_ref(buf, arg));
4064 }
4065
4066 /* a generic arc_done_func_t */
4067 void
4068 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
4069 {
4070         arc_buf_t **bufp = arg;
4071         if (zio && zio->io_error) {
4072                 VERIFY(arc_buf_remove_ref(buf, arg));
4073                 *bufp = NULL;
4074         } else {
4075                 *bufp = buf;
4076                 ASSERT(buf->b_data);
4077         }
4078 }
4079
4080 static void
4081 arc_read_done(zio_t *zio)
4082 {
4083         arc_buf_hdr_t   *hdr;
4084         arc_buf_t       *buf;
4085         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
4086         kmutex_t        *hash_lock = NULL;
4087         arc_callback_t  *callback_list, *acb;
4088         int             freeable = FALSE;
4089
4090         buf = zio->io_private;
4091         hdr = buf->b_hdr;
4092
4093         /*
4094          * The hdr was inserted into hash-table and removed from lists
4095          * prior to starting I/O.  We should find this header, since
4096          * it's in the hash table, and it should be legit since it's
4097          * not possible to evict it during the I/O.  The only possible
4098          * reason for it not to be found is if we were freed during the
4099          * read.
4100          */
4101         if (HDR_IN_HASH_TABLE(hdr)) {
4102                 arc_buf_hdr_t *found;
4103
4104                 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
4105                 ASSERT3U(hdr->b_dva.dva_word[0], ==,
4106                     BP_IDENTITY(zio->io_bp)->dva_word[0]);
4107                 ASSERT3U(hdr->b_dva.dva_word[1], ==,
4108                     BP_IDENTITY(zio->io_bp)->dva_word[1]);
4109
4110                 found = buf_hash_find(hdr->b_spa, zio->io_bp,
4111                     &hash_lock);
4112
4113                 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
4114                     hash_lock == NULL) ||
4115                     (found == hdr &&
4116                     DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
4117                     (found == hdr && HDR_L2_READING(hdr)));
4118         }
4119
4120         hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
4121         if (l2arc_noprefetch && HDR_PREFETCH(hdr))
4122                 hdr->b_flags &= ~ARC_FLAG_L2CACHE;
4123
4124         /* byteswap if necessary */
4125         callback_list = hdr->b_l1hdr.b_acb;
4126         ASSERT(callback_list != NULL);
4127         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
4128                 dmu_object_byteswap_t bswap =
4129                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
4130                 if (BP_GET_LEVEL(zio->io_bp) > 0)
4131                     byteswap_uint64_array(buf->b_data, hdr->b_size);
4132                 else
4133                     dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
4134         }
4135
4136         arc_cksum_compute(buf, B_FALSE);
4137         arc_buf_watch(buf);
4138
4139         if (hash_lock && zio->io_error == 0 &&
4140             hdr->b_l1hdr.b_state == arc_anon) {
4141                 /*
4142                  * Only call arc_access on anonymous buffers.  This is because
4143                  * if we've issued an I/O for an evicted buffer, we've already
4144                  * called arc_access (to prevent any simultaneous readers from
4145                  * getting confused).
4146                  */
4147                 arc_access(hdr, hash_lock);
4148         }
4149
4150         /* create copies of the data buffer for the callers */
4151         abuf = buf;
4152         for (acb = callback_list; acb; acb = acb->acb_next) {
4153                 if (acb->acb_done) {
4154                         if (abuf == NULL) {
4155                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
4156                                 abuf = arc_buf_clone(buf);
4157                         }
4158                         acb->acb_buf = abuf;
4159                         abuf = NULL;
4160                 }
4161         }
4162         hdr->b_l1hdr.b_acb = NULL;
4163         hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4164         ASSERT(!HDR_BUF_AVAILABLE(hdr));
4165         if (abuf == buf) {
4166                 ASSERT(buf->b_efunc == NULL);
4167                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4168                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4169         }
4170
4171         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
4172             callback_list != NULL);
4173
4174         if (zio->io_error != 0) {
4175                 hdr->b_flags |= ARC_FLAG_IO_ERROR;
4176                 if (hdr->b_l1hdr.b_state != arc_anon)
4177                         arc_change_state(arc_anon, hdr, hash_lock);
4178                 if (HDR_IN_HASH_TABLE(hdr))
4179                         buf_hash_remove(hdr);
4180                 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4181         }
4182
4183         /*
4184          * Broadcast before we drop the hash_lock to avoid the possibility
4185          * that the hdr (and hence the cv) might be freed before we get to
4186          * the cv_broadcast().
4187          */
4188         cv_broadcast(&hdr->b_l1hdr.b_cv);
4189
4190         if (hash_lock != NULL) {
4191                 mutex_exit(hash_lock);
4192         } else {
4193                 /*
4194                  * This block was freed while we waited for the read to
4195                  * complete.  It has been removed from the hash table and
4196                  * moved to the anonymous state (so that it won't show up
4197                  * in the cache).
4198                  */
4199                 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
4200                 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4201         }
4202
4203         /* execute each callback and free its structure */
4204         while ((acb = callback_list) != NULL) {
4205                 if (acb->acb_done)
4206                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
4207
4208                 if (acb->acb_zio_dummy != NULL) {
4209                         acb->acb_zio_dummy->io_error = zio->io_error;
4210                         zio_nowait(acb->acb_zio_dummy);
4211                 }
4212
4213                 callback_list = acb->acb_next;
4214                 kmem_free(acb, sizeof (arc_callback_t));
4215         }
4216
4217         if (freeable)
4218                 arc_hdr_destroy(hdr);
4219 }
4220
4221 /*
4222  * "Read" the block at the specified DVA (in bp) via the
4223  * cache.  If the block is found in the cache, invoke the provided
4224  * callback immediately and return.  Note that the `zio' parameter
4225  * in the callback will be NULL in this case, since no IO was
4226  * required.  If the block is not in the cache pass the read request
4227  * on to the spa with a substitute callback function, so that the
4228  * requested block will be added to the cache.
4229  *
4230  * If a read request arrives for a block that has a read in-progress,
4231  * either wait for the in-progress read to complete (and return the
4232  * results); or, if this is a read with a "done" func, add a record
4233  * to the read to invoke the "done" func when the read completes,
4234  * and return; or just return.
4235  *
4236  * arc_read_done() will invoke all the requested "done" functions
4237  * for readers of this block.
4238  */
4239 int
4240 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
4241     void *private, zio_priority_t priority, int zio_flags,
4242     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
4243 {
4244         arc_buf_hdr_t *hdr = NULL;
4245         arc_buf_t *buf = NULL;
4246         kmutex_t *hash_lock = NULL;
4247         zio_t *rzio;
4248         uint64_t guid = spa_load_guid(spa);
4249         int rc = 0;
4250
4251         ASSERT(!BP_IS_EMBEDDED(bp) ||
4252             BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
4253
4254 top:
4255         if (!BP_IS_EMBEDDED(bp)) {
4256                 /*
4257                  * Embedded BP's have no DVA and require no I/O to "read".
4258                  * Create an anonymous arc buf to back it.
4259                  */
4260                 hdr = buf_hash_find(guid, bp, &hash_lock);
4261         }
4262
4263         if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
4264
4265                 *arc_flags |= ARC_FLAG_CACHED;
4266
4267                 if (HDR_IO_IN_PROGRESS(hdr)) {
4268
4269                         if (*arc_flags & ARC_FLAG_WAIT) {
4270                                 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
4271                                 mutex_exit(hash_lock);
4272                                 goto top;
4273                         }
4274                         ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4275
4276                         if (done) {
4277                                 arc_callback_t  *acb = NULL;
4278
4279                                 acb = kmem_zalloc(sizeof (arc_callback_t),
4280                                     KM_SLEEP);
4281                                 acb->acb_done = done;
4282                                 acb->acb_private = private;
4283                                 if (pio != NULL)
4284                                         acb->acb_zio_dummy = zio_null(pio,
4285                                             spa, NULL, NULL, NULL, zio_flags);
4286
4287                                 ASSERT(acb->acb_done != NULL);
4288                                 acb->acb_next = hdr->b_l1hdr.b_acb;
4289                                 hdr->b_l1hdr.b_acb = acb;
4290                                 add_reference(hdr, hash_lock, private);
4291                                 mutex_exit(hash_lock);
4292                                 goto out;
4293                         }
4294                         mutex_exit(hash_lock);
4295                         goto out;
4296                 }
4297
4298                 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4299                     hdr->b_l1hdr.b_state == arc_mfu);
4300
4301                 if (done) {
4302                         add_reference(hdr, hash_lock, private);
4303                         /*
4304                          * If this block is already in use, create a new
4305                          * copy of the data so that we will be guaranteed
4306                          * that arc_release() will always succeed.
4307                          */
4308                         buf = hdr->b_l1hdr.b_buf;
4309                         ASSERT(buf);
4310                         ASSERT(buf->b_data);
4311                         if (HDR_BUF_AVAILABLE(hdr)) {
4312                                 ASSERT(buf->b_efunc == NULL);
4313                                 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4314                         } else {
4315                                 buf = arc_buf_clone(buf);
4316                         }
4317
4318                 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
4319                     refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
4320                         hdr->b_flags |= ARC_FLAG_PREFETCH;
4321                 }
4322                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4323                 arc_access(hdr, hash_lock);
4324                 if (*arc_flags & ARC_FLAG_L2CACHE)
4325                         hdr->b_flags |= ARC_FLAG_L2CACHE;
4326                 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4327                         hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4328                 mutex_exit(hash_lock);
4329                 ARCSTAT_BUMP(arcstat_hits);
4330                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4331                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4332                     data, metadata, hits);
4333
4334                 if (done)
4335                         done(NULL, buf, private);
4336         } else {
4337                 uint64_t size = BP_GET_LSIZE(bp);
4338                 arc_callback_t *acb;
4339                 vdev_t *vd = NULL;
4340                 uint64_t addr = 0;
4341                 boolean_t devw = B_FALSE;
4342                 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
4343                 int32_t b_asize = 0;
4344
4345                 /*
4346                  * Gracefully handle a damaged logical block size as a
4347                  * checksum error by passing a dummy zio to the done callback.
4348                  */
4349                 if (size > spa_maxblocksize(spa)) {
4350                         if (done) {
4351                                 rzio = zio_null(pio, spa, NULL,
4352                                     NULL, NULL, zio_flags);
4353                                 rzio->io_error = ECKSUM;
4354                                 done(rzio, buf, private);
4355                                 zio_nowait(rzio);
4356                         }
4357                         rc = ECKSUM;
4358                         goto out;
4359                 }
4360
4361                 if (hdr == NULL) {
4362                         /* this block is not in the cache */
4363                         arc_buf_hdr_t *exists = NULL;
4364                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4365                         buf = arc_buf_alloc(spa, size, private, type);
4366                         hdr = buf->b_hdr;
4367                         if (!BP_IS_EMBEDDED(bp)) {
4368                                 hdr->b_dva = *BP_IDENTITY(bp);
4369                                 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
4370                                 exists = buf_hash_insert(hdr, &hash_lock);
4371                         }
4372                         if (exists != NULL) {
4373                                 /* somebody beat us to the hash insert */
4374                                 mutex_exit(hash_lock);
4375                                 buf_discard_identity(hdr);
4376                                 (void) arc_buf_remove_ref(buf, private);
4377                                 goto top; /* restart the IO request */
4378                         }
4379
4380                         /* if this is a prefetch, we don't have a reference */
4381                         if (*arc_flags & ARC_FLAG_PREFETCH) {
4382                                 (void) remove_reference(hdr, hash_lock,
4383                                     private);
4384                                 hdr->b_flags |= ARC_FLAG_PREFETCH;
4385                         }
4386                         if (*arc_flags & ARC_FLAG_L2CACHE)
4387                                 hdr->b_flags |= ARC_FLAG_L2CACHE;
4388                         if (*arc_flags & ARC_FLAG_L2COMPRESS)
4389                                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4390                         if (BP_GET_LEVEL(bp) > 0)
4391                                 hdr->b_flags |= ARC_FLAG_INDIRECT;
4392                 } else {
4393                         /*
4394                          * This block is in the ghost cache. If it was L2-only
4395                          * (and thus didn't have an L1 hdr), we realloc the
4396                          * header to add an L1 hdr.
4397                          */
4398                         if (!HDR_HAS_L1HDR(hdr)) {
4399                                 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4400                                     hdr_full_cache);
4401                         }
4402
4403                         ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
4404                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4405                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4406                         ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
4407
4408                         /* if this is a prefetch, we don't have a reference */
4409                         if (*arc_flags & ARC_FLAG_PREFETCH)
4410                                 hdr->b_flags |= ARC_FLAG_PREFETCH;
4411                         else
4412                                 add_reference(hdr, hash_lock, private);
4413                         if (*arc_flags & ARC_FLAG_L2CACHE)
4414                                 hdr->b_flags |= ARC_FLAG_L2CACHE;
4415                         if (*arc_flags & ARC_FLAG_L2COMPRESS)
4416                                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4417                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4418                         buf->b_hdr = hdr;
4419                         buf->b_data = NULL;
4420                         buf->b_efunc = NULL;
4421                         buf->b_private = NULL;
4422                         buf->b_next = NULL;
4423                         hdr->b_l1hdr.b_buf = buf;
4424                         ASSERT0(hdr->b_l1hdr.b_datacnt);
4425                         hdr->b_l1hdr.b_datacnt = 1;
4426                         arc_get_data_buf(buf);
4427                         arc_access(hdr, hash_lock);
4428                 }
4429
4430                 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4431
4432                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4433                 acb->acb_done = done;
4434                 acb->acb_private = private;
4435
4436                 ASSERT(hdr->b_l1hdr.b_acb == NULL);
4437                 hdr->b_l1hdr.b_acb = acb;
4438                 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4439
4440                 if (HDR_HAS_L2HDR(hdr) &&
4441                     (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4442                         devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4443                         addr = hdr->b_l2hdr.b_daddr;
4444                         b_compress = HDR_GET_COMPRESS(hdr);
4445                         b_asize = hdr->b_l2hdr.b_asize;
4446                         /*
4447                          * Lock out device removal.
4448                          */
4449                         if (vdev_is_dead(vd) ||
4450                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4451                                 vd = NULL;
4452                 }
4453
4454                 if (hash_lock != NULL)
4455                         mutex_exit(hash_lock);
4456
4457                 /*
4458                  * At this point, we have a level 1 cache miss.  Try again in
4459                  * L2ARC if possible.
4460                  */
4461                 ASSERT3U(hdr->b_size, ==, size);
4462                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4463                     uint64_t, size, zbookmark_phys_t *, zb);
4464                 ARCSTAT_BUMP(arcstat_misses);
4465                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4466                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4467                     data, metadata, misses);
4468
4469                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4470                         /*
4471                          * Read from the L2ARC if the following are true:
4472                          * 1. The L2ARC vdev was previously cached.
4473                          * 2. This buffer still has L2ARC metadata.
4474                          * 3. This buffer isn't currently writing to the L2ARC.
4475                          * 4. The L2ARC entry wasn't evicted, which may
4476                          *    also have invalidated the vdev.
4477                          * 5. This isn't prefetch and l2arc_noprefetch is set.
4478                          */
4479                         if (HDR_HAS_L2HDR(hdr) &&
4480                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4481                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4482                                 l2arc_read_callback_t *cb;
4483
4484                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4485                                 ARCSTAT_BUMP(arcstat_l2_hits);
4486                                 atomic_inc_32(&hdr->b_l2hdr.b_hits);
4487
4488                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4489                                     KM_SLEEP);
4490                                 cb->l2rcb_buf = buf;
4491                                 cb->l2rcb_spa = spa;
4492                                 cb->l2rcb_bp = *bp;
4493                                 cb->l2rcb_zb = *zb;
4494                                 cb->l2rcb_flags = zio_flags;
4495                                 cb->l2rcb_compress = b_compress;
4496
4497                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4498                                     addr + size < vd->vdev_psize -
4499                                     VDEV_LABEL_END_SIZE);
4500
4501                                 /*
4502                                  * l2arc read.  The SCL_L2ARC lock will be
4503                                  * released by l2arc_read_done().
4504                                  * Issue a null zio if the underlying buffer
4505                                  * was squashed to zero size by compression.
4506                                  */
4507                                 if (b_compress == ZIO_COMPRESS_EMPTY) {
4508                                         rzio = zio_null(pio, spa, vd,
4509                                             l2arc_read_done, cb,
4510                                             zio_flags | ZIO_FLAG_DONT_CACHE |
4511                                             ZIO_FLAG_CANFAIL |
4512                                             ZIO_FLAG_DONT_PROPAGATE |
4513                                             ZIO_FLAG_DONT_RETRY);
4514                                 } else {
4515                                         rzio = zio_read_phys(pio, vd, addr,
4516                                             b_asize, buf->b_data,
4517                                             ZIO_CHECKSUM_OFF,
4518                                             l2arc_read_done, cb, priority,
4519                                             zio_flags | ZIO_FLAG_DONT_CACHE |
4520                                             ZIO_FLAG_CANFAIL |
4521                                             ZIO_FLAG_DONT_PROPAGATE |
4522                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
4523                                 }
4524                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4525                                     zio_t *, rzio);
4526                                 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4527
4528                                 if (*arc_flags & ARC_FLAG_NOWAIT) {
4529                                         zio_nowait(rzio);
4530                                         goto out;
4531                                 }
4532
4533                                 ASSERT(*arc_flags & ARC_FLAG_WAIT);
4534                                 if (zio_wait(rzio) == 0)
4535                                         goto out;
4536
4537                                 /* l2arc read error; goto zio_read() */
4538                         } else {
4539                                 DTRACE_PROBE1(l2arc__miss,
4540                                     arc_buf_hdr_t *, hdr);
4541                                 ARCSTAT_BUMP(arcstat_l2_misses);
4542                                 if (HDR_L2_WRITING(hdr))
4543                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
4544                                 spa_config_exit(spa, SCL_L2ARC, vd);
4545                         }
4546                 } else {
4547                         if (vd != NULL)
4548                                 spa_config_exit(spa, SCL_L2ARC, vd);
4549                         if (l2arc_ndev != 0) {
4550                                 DTRACE_PROBE1(l2arc__miss,
4551                                     arc_buf_hdr_t *, hdr);
4552                                 ARCSTAT_BUMP(arcstat_l2_misses);
4553                         }
4554                 }
4555
4556                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
4557                     arc_read_done, buf, priority, zio_flags, zb);
4558
4559                 if (*arc_flags & ARC_FLAG_WAIT) {
4560                         rc = zio_wait(rzio);
4561                         goto out;
4562                 }
4563
4564                 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4565                 zio_nowait(rzio);
4566         }
4567
4568 out:
4569         spa_read_history_add(spa, zb, *arc_flags);
4570         return (rc);
4571 }
4572
4573 arc_prune_t *
4574 arc_add_prune_callback(arc_prune_func_t *func, void *private)
4575 {
4576         arc_prune_t *p;
4577
4578         p = kmem_alloc(sizeof (*p), KM_SLEEP);
4579         p->p_pfunc = func;
4580         p->p_private = private;
4581         list_link_init(&p->p_node);
4582         refcount_create(&p->p_refcnt);
4583
4584         mutex_enter(&arc_prune_mtx);
4585         refcount_add(&p->p_refcnt, &arc_prune_list);
4586         list_insert_head(&arc_prune_list, p);
4587         mutex_exit(&arc_prune_mtx);
4588
4589         return (p);
4590 }
4591
4592 void
4593 arc_remove_prune_callback(arc_prune_t *p)
4594 {
4595         mutex_enter(&arc_prune_mtx);
4596         list_remove(&arc_prune_list, p);
4597         if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
4598                 refcount_destroy(&p->p_refcnt);
4599                 kmem_free(p, sizeof (*p));
4600         }
4601         mutex_exit(&arc_prune_mtx);
4602 }
4603
4604 void
4605 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4606 {
4607         ASSERT(buf->b_hdr != NULL);
4608         ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4609         ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4610             func == NULL);
4611         ASSERT(buf->b_efunc == NULL);
4612         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4613
4614         buf->b_efunc = func;
4615         buf->b_private = private;
4616 }
4617
4618 /*
4619  * Notify the arc that a block was freed, and thus will never be used again.
4620  */
4621 void
4622 arc_freed(spa_t *spa, const blkptr_t *bp)
4623 {
4624         arc_buf_hdr_t *hdr;
4625         kmutex_t *hash_lock;
4626         uint64_t guid = spa_load_guid(spa);
4627
4628         ASSERT(!BP_IS_EMBEDDED(bp));
4629
4630         hdr = buf_hash_find(guid, bp, &hash_lock);
4631         if (hdr == NULL)
4632                 return;
4633         if (HDR_BUF_AVAILABLE(hdr)) {
4634                 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4635                 add_reference(hdr, hash_lock, FTAG);
4636                 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4637                 mutex_exit(hash_lock);
4638
4639                 arc_release(buf, FTAG);
4640                 (void) arc_buf_remove_ref(buf, FTAG);
4641         } else {
4642                 mutex_exit(hash_lock);
4643         }
4644
4645 }
4646
4647 /*
4648  * Clear the user eviction callback set by arc_set_callback(), first calling
4649  * it if it exists.  Because the presence of a callback keeps an arc_buf cached
4650  * clearing the callback may result in the arc_buf being destroyed.  However,
4651  * it will not result in the *last* arc_buf being destroyed, hence the data
4652  * will remain cached in the ARC. We make a copy of the arc buffer here so
4653  * that we can process the callback without holding any locks.
4654  *
4655  * It's possible that the callback is already in the process of being cleared
4656  * by another thread.  In this case we can not clear the callback.
4657  *
4658  * Returns B_TRUE if the callback was successfully called and cleared.
4659  */
4660 boolean_t
4661 arc_clear_callback(arc_buf_t *buf)
4662 {
4663         arc_buf_hdr_t *hdr;
4664         kmutex_t *hash_lock;
4665         arc_evict_func_t *efunc = buf->b_efunc;
4666         void *private = buf->b_private;
4667
4668         mutex_enter(&buf->b_evict_lock);
4669         hdr = buf->b_hdr;
4670         if (hdr == NULL) {
4671                 /*
4672                  * We are in arc_do_user_evicts().
4673                  */
4674                 ASSERT(buf->b_data == NULL);
4675                 mutex_exit(&buf->b_evict_lock);
4676                 return (B_FALSE);
4677         } else if (buf->b_data == NULL) {
4678                 /*
4679                  * We are on the eviction list; process this buffer now
4680                  * but let arc_do_user_evicts() do the reaping.
4681                  */
4682                 buf->b_efunc = NULL;
4683                 mutex_exit(&buf->b_evict_lock);
4684                 VERIFY0(efunc(private));
4685                 return (B_TRUE);
4686         }
4687         hash_lock = HDR_LOCK(hdr);
4688         mutex_enter(hash_lock);
4689         hdr = buf->b_hdr;
4690         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4691
4692         ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4693             hdr->b_l1hdr.b_datacnt);
4694         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4695             hdr->b_l1hdr.b_state == arc_mfu);
4696
4697         buf->b_efunc = NULL;
4698         buf->b_private = NULL;
4699
4700         if (hdr->b_l1hdr.b_datacnt > 1) {
4701                 mutex_exit(&buf->b_evict_lock);
4702                 arc_buf_destroy(buf, TRUE);
4703         } else {
4704                 ASSERT(buf == hdr->b_l1hdr.b_buf);
4705                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4706                 mutex_exit(&buf->b_evict_lock);
4707         }
4708
4709         mutex_exit(hash_lock);
4710         VERIFY0(efunc(private));
4711         return (B_TRUE);
4712 }
4713
4714 /*
4715  * Release this buffer from the cache, making it an anonymous buffer.  This
4716  * must be done after a read and prior to modifying the buffer contents.
4717  * If the buffer has more than one reference, we must make
4718  * a new hdr for the buffer.
4719  */
4720 void
4721 arc_release(arc_buf_t *buf, void *tag)
4722 {
4723         kmutex_t *hash_lock;
4724         arc_state_t *state;
4725         arc_buf_hdr_t *hdr = buf->b_hdr;
4726
4727         /*
4728          * It would be nice to assert that if its DMU metadata (level >
4729          * 0 || it's the dnode file), then it must be syncing context.
4730          * But we don't know that information at this level.
4731          */
4732
4733         mutex_enter(&buf->b_evict_lock);
4734
4735         ASSERT(HDR_HAS_L1HDR(hdr));
4736
4737         /*
4738          * We don't grab the hash lock prior to this check, because if
4739          * the buffer's header is in the arc_anon state, it won't be
4740          * linked into the hash table.
4741          */
4742         if (hdr->b_l1hdr.b_state == arc_anon) {
4743                 mutex_exit(&buf->b_evict_lock);
4744                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4745                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
4746                 ASSERT(!HDR_HAS_L2HDR(hdr));
4747                 ASSERT(BUF_EMPTY(hdr));
4748
4749                 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4750                 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4751                 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4752
4753                 ASSERT3P(buf->b_efunc, ==, NULL);
4754                 ASSERT3P(buf->b_private, ==, NULL);
4755
4756                 hdr->b_l1hdr.b_arc_access = 0;
4757                 arc_buf_thaw(buf);
4758
4759                 return;
4760         }
4761
4762         hash_lock = HDR_LOCK(hdr);
4763         mutex_enter(hash_lock);
4764
4765         /*
4766          * This assignment is only valid as long as the hash_lock is
4767          * held, we must be careful not to reference state or the
4768          * b_state field after dropping the lock.
4769          */
4770         state = hdr->b_l1hdr.b_state;
4771         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4772         ASSERT3P(state, !=, arc_anon);
4773
4774         /* this buffer is not on any list */
4775         ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4776
4777         if (HDR_HAS_L2HDR(hdr)) {
4778                 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4779
4780                 /*
4781                  * We have to recheck this conditional again now that
4782                  * we're holding the l2ad_mtx to prevent a race with
4783                  * another thread which might be concurrently calling
4784                  * l2arc_evict(). In that case, l2arc_evict() might have
4785                  * destroyed the header's L2 portion as we were waiting
4786                  * to acquire the l2ad_mtx.
4787                  */
4788                 if (HDR_HAS_L2HDR(hdr))
4789                         arc_hdr_l2hdr_destroy(hdr);
4790
4791                 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4792         }
4793
4794         /*
4795          * Do we have more than one buf?
4796          */
4797         if (hdr->b_l1hdr.b_datacnt > 1) {
4798                 arc_buf_hdr_t *nhdr;
4799                 arc_buf_t **bufp;
4800                 uint64_t blksz = hdr->b_size;
4801                 uint64_t spa = hdr->b_spa;
4802                 arc_buf_contents_t type = arc_buf_type(hdr);
4803                 uint32_t flags = hdr->b_flags;
4804
4805                 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4806                 /*
4807                  * Pull the data off of this hdr and attach it to
4808                  * a new anonymous hdr.
4809                  */
4810                 (void) remove_reference(hdr, hash_lock, tag);
4811                 bufp = &hdr->b_l1hdr.b_buf;
4812                 while (*bufp != buf)
4813                         bufp = &(*bufp)->b_next;
4814                 *bufp = buf->b_next;
4815                 buf->b_next = NULL;
4816
4817                 ASSERT3P(state, !=, arc_l2c_only);
4818
4819                 (void) refcount_remove_many(
4820                     &state->arcs_size, hdr->b_size, buf);
4821
4822                 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4823                         uint64_t *size;
4824
4825                         ASSERT3P(state, !=, arc_l2c_only);
4826                         size = &state->arcs_lsize[type];
4827                         ASSERT3U(*size, >=, hdr->b_size);
4828                         atomic_add_64(size, -hdr->b_size);
4829                 }
4830
4831                 /*
4832                  * We're releasing a duplicate user data buffer, update
4833                  * our statistics accordingly.
4834                  */
4835                 if (HDR_ISTYPE_DATA(hdr)) {
4836                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4837                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4838                             -hdr->b_size);
4839                 }
4840                 hdr->b_l1hdr.b_datacnt -= 1;
4841                 arc_cksum_verify(buf);
4842                 arc_buf_unwatch(buf);
4843
4844                 mutex_exit(hash_lock);
4845
4846                 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4847                 nhdr->b_size = blksz;
4848                 nhdr->b_spa = spa;
4849
4850                 nhdr->b_l1hdr.b_mru_hits = 0;
4851                 nhdr->b_l1hdr.b_mru_ghost_hits = 0;
4852                 nhdr->b_l1hdr.b_mfu_hits = 0;
4853                 nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
4854                 nhdr->b_l1hdr.b_l2_hits = 0;
4855                 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4856                 nhdr->b_flags |= arc_bufc_to_flags(type);
4857                 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4858
4859                 nhdr->b_l1hdr.b_buf = buf;
4860                 nhdr->b_l1hdr.b_datacnt = 1;
4861                 nhdr->b_l1hdr.b_state = arc_anon;
4862                 nhdr->b_l1hdr.b_arc_access = 0;
4863                 nhdr->b_l1hdr.b_tmp_cdata = NULL;
4864                 nhdr->b_freeze_cksum = NULL;
4865
4866                 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4867                 buf->b_hdr = nhdr;
4868                 mutex_exit(&buf->b_evict_lock);
4869                 (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
4870         } else {
4871                 mutex_exit(&buf->b_evict_lock);
4872                 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4873                 /* protected by hash lock, or hdr is on arc_anon */
4874                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4875                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4876                 hdr->b_l1hdr.b_mru_hits = 0;
4877                 hdr->b_l1hdr.b_mru_ghost_hits = 0;
4878                 hdr->b_l1hdr.b_mfu_hits = 0;
4879                 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
4880                 hdr->b_l1hdr.b_l2_hits = 0;
4881                 arc_change_state(arc_anon, hdr, hash_lock);
4882                 hdr->b_l1hdr.b_arc_access = 0;
4883                 mutex_exit(hash_lock);
4884
4885                 buf_discard_identity(hdr);
4886                 arc_buf_thaw(buf);
4887         }
4888         buf->b_efunc = NULL;
4889         buf->b_private = NULL;
4890 }
4891
4892 int
4893 arc_released(arc_buf_t *buf)
4894 {
4895         int released;
4896
4897         mutex_enter(&buf->b_evict_lock);
4898         released = (buf->b_data != NULL &&
4899             buf->b_hdr->b_l1hdr.b_state == arc_anon);
4900         mutex_exit(&buf->b_evict_lock);
4901         return (released);
4902 }
4903
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: returns the reference count of the buf's header,
 * sampled with the buf's b_evict_lock held.  The result is non-zero when
 * the header is referenced.
 */
int
arc_referenced(arc_buf_t *buf)
{
        int refs;

        mutex_enter(&buf->b_evict_lock);
        refs = refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt);
        mutex_exit(&buf->b_evict_lock);

        return (refs);
}
#endif
4916
4917 static void
4918 arc_write_ready(zio_t *zio)
4919 {
4920         arc_write_callback_t *callback = zio->io_private;
4921         arc_buf_t *buf = callback->awcb_buf;
4922         arc_buf_hdr_t *hdr = buf->b_hdr;
4923
4924         ASSERT(HDR_HAS_L1HDR(hdr));
4925         ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4926         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4927         callback->awcb_ready(zio, buf, callback->awcb_private);
4928
4929         /*
4930          * If the IO is already in progress, then this is a re-write
4931          * attempt, so we need to thaw and re-compute the cksum.
4932          * It is the responsibility of the callback to handle the
4933          * accounting for any re-write attempt.
4934          */
4935         if (HDR_IO_IN_PROGRESS(hdr)) {
4936                 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
4937                 if (hdr->b_freeze_cksum != NULL) {
4938                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4939                         hdr->b_freeze_cksum = NULL;
4940                 }
4941                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
4942         }
4943         arc_cksum_compute(buf, B_FALSE);
4944         hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4945 }
4946
4947 /*
4948  * The SPA calls this callback for each physical write that happens on behalf
4949  * of a logical write.  See the comment in dbuf_write_physdone() for details.
4950  */
4951 static void
4952 arc_write_physdone(zio_t *zio)
4953 {
4954         arc_write_callback_t *cb = zio->io_private;
4955         if (cb->awcb_physdone != NULL)
4956                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4957 }
4958
4959 static void
4960 arc_write_done(zio_t *zio)
4961 {
4962         arc_write_callback_t *callback = zio->io_private;
4963         arc_buf_t *buf = callback->awcb_buf;
4964         arc_buf_hdr_t *hdr = buf->b_hdr;
4965
4966         ASSERT(hdr->b_l1hdr.b_acb == NULL);
4967
4968         if (zio->io_error == 0) {
4969                 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
4970                         buf_discard_identity(hdr);
4971                 } else {
4972                         hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4973                         hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
4974                 }
4975         } else {
4976                 ASSERT(BUF_EMPTY(hdr));
4977         }
4978
4979         /*
4980          * If the block to be written was all-zero or compressed enough to be
4981          * embedded in the BP, no write was performed so there will be no
4982          * dva/birth/checksum.  The buffer must therefore remain anonymous
4983          * (and uncached).
4984          */
4985         if (!BUF_EMPTY(hdr)) {
4986                 arc_buf_hdr_t *exists;
4987                 kmutex_t *hash_lock;
4988
4989                 ASSERT(zio->io_error == 0);
4990
4991                 arc_cksum_verify(buf);
4992
4993                 exists = buf_hash_insert(hdr, &hash_lock);
4994                 if (exists != NULL) {
4995                         /*
4996                          * This can only happen if we overwrite for
4997                          * sync-to-convergence, because we remove
4998                          * buffers from the hash table when we arc_free().
4999                          */
5000                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
5001                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5002                                         panic("bad overwrite, hdr=%p exists=%p",
5003                                             (void *)hdr, (void *)exists);
5004                                 ASSERT(refcount_is_zero(
5005                                     &exists->b_l1hdr.b_refcnt));
5006                                 arc_change_state(arc_anon, exists, hash_lock);
5007                                 mutex_exit(hash_lock);
5008                                 arc_hdr_destroy(exists);
5009                                 exists = buf_hash_insert(hdr, &hash_lock);
5010                                 ASSERT3P(exists, ==, NULL);
5011                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
5012                                 /* nopwrite */
5013                                 ASSERT(zio->io_prop.zp_nopwrite);
5014                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5015                                         panic("bad nopwrite, hdr=%p exists=%p",
5016                                             (void *)hdr, (void *)exists);
5017                         } else {
5018                                 /* Dedup */
5019                                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
5020                                 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
5021                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
5022                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
5023                         }
5024                 }
5025                 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
5026                 /* if it's not anon, we are doing a scrub */
5027                 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
5028                         arc_access(hdr, hash_lock);
5029                 mutex_exit(hash_lock);
5030         } else {
5031                 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
5032         }
5033
5034         ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5035         callback->awcb_done(zio, buf, callback->awcb_private);
5036
5037         kmem_free(callback, sizeof (arc_write_callback_t));
5038 }
5039
5040 zio_t *
5041 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
5042     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
5043     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
5044     arc_done_func_t *done, void *private, zio_priority_t priority,
5045     int zio_flags, const zbookmark_phys_t *zb)
5046 {
5047         arc_buf_hdr_t *hdr = buf->b_hdr;
5048         arc_write_callback_t *callback;
5049         zio_t *zio;
5050
5051         ASSERT(ready != NULL);
5052         ASSERT(done != NULL);
5053         ASSERT(!HDR_IO_ERROR(hdr));
5054         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5055         ASSERT(hdr->b_l1hdr.b_acb == NULL);
5056         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
5057         if (l2arc)
5058                 hdr->b_flags |= ARC_FLAG_L2CACHE;
5059         if (l2arc_compress)
5060                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
5061         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
5062         callback->awcb_ready = ready;
5063         callback->awcb_physdone = physdone;
5064         callback->awcb_done = done;
5065         callback->awcb_private = private;
5066         callback->awcb_buf = buf;
5067
5068         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
5069             arc_write_ready, arc_write_physdone, arc_write_done, callback,
5070             priority, zio_flags, zb);
5071
5072         return (zio);
5073 }
5074
/*
 * Decide whether a write should be delayed because memory is low.
 *
 * In the kernel this returns EAGAIN (and bumps the throttle statistics)
 * when throttling is not disabled, free memory is at or below
 * arc_lotsfree_percent of physical memory, and arc_reclaim_needed() says
 * so.  In every other case, and always outside the kernel, it returns 0.
 * Neither 'reserve' nor 'txg' is consulted.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
        int error = 0;

#ifdef _KERNEL
        if (!zfs_arc_memory_throttle_disable &&
            freemem <= physmem * arc_lotsfree_percent / 100 &&
            arc_reclaim_needed()) {
                /* memory is low, delay before restarting */
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                error = SET_ERROR(EAGAIN);
        }
#endif
        return (error);
}
5094
5095 void
5096 arc_tempreserve_clear(uint64_t reserve)
5097 {
5098         atomic_add_64(&arc_tempreserve, -reserve);
5099         ASSERT((int64_t)arc_tempreserve >= 0);
5100 }
5101
5102 int
5103 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
5104 {
5105         int error;
5106         uint64_t anon_size;
5107
5108         if (reserve > arc_c/4 && !arc_no_grow)
5109                 arc_c = MIN(arc_c_max, reserve * 4);
5110
5111         /*
5112          * Throttle when the calculated memory footprint for the TXG
5113          * exceeds the target ARC size.
5114          */
5115         if (reserve > arc_c) {
5116                 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
5117                 return (SET_ERROR(ERESTART));
5118         }
5119
5120         /*
5121          * Don't count loaned bufs as in flight dirty data to prevent long
5122          * network delays from blocking transactions that are ready to be
5123          * assigned to a txg.
5124          */
5125         anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
5126             arc_loaned_bytes), 0);
5127
5128         /*
5129          * Writes will, almost always, require additional memory allocations
5130          * in order to compress/encrypt/etc the data.  We therefore need to
5131          * make sure that there is sufficient available memory for this.
5132          */
5133         error = arc_memory_throttle(reserve, txg);
5134         if (error != 0)
5135                 return (error);
5136
5137         /*
5138          * Throttle writes when the amount of dirty data in the cache
5139          * gets too large.  We try to keep the cache less than half full
5140          * of dirty blocks so that our sync times don't grow too large.
5141          * Note: if two requests come in concurrently, we might let them
5142          * both succeed, when one of them should fail.  Not a huge deal.
5143          */
5144
5145         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
5146             anon_size > arc_c / 4) {
5147                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
5148                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
5149                     arc_tempreserve>>10,
5150                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
5151                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
5152                     reserve>>10, arc_c>>10);
5153                 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
5154                 return (SET_ERROR(ERESTART));
5155         }
5156         atomic_add_64(&arc_tempreserve, reserve);
5157         return (0);
5158 }
5159
5160 static void
5161 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
5162     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
5163 {
5164         size->value.ui64 = refcount_count(&state->arcs_size);
5165         evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
5166         evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
5167 }
5168
5169 static int
5170 arc_kstat_update(kstat_t *ksp, int rw)
5171 {
5172         arc_stats_t *as = ksp->ks_data;
5173
5174         if (rw == KSTAT_WRITE) {
5175                 return (EACCES);
5176         } else {
5177                 arc_kstat_update_state(arc_anon,
5178                     &as->arcstat_anon_size,
5179                     &as->arcstat_anon_evictable_data,
5180                     &as->arcstat_anon_evictable_metadata);
5181                 arc_kstat_update_state(arc_mru,
5182                     &as->arcstat_mru_size,
5183                     &as->arcstat_mru_evictable_data,
5184                     &as->arcstat_mru_evictable_metadata);
5185                 arc_kstat_update_state(arc_mru_ghost,
5186                     &as->arcstat_mru_ghost_size,
5187                     &as->arcstat_mru_ghost_evictable_data,
5188                     &as->arcstat_mru_ghost_evictable_metadata);
5189                 arc_kstat_update_state(arc_mfu,
5190                     &as->arcstat_mfu_size,
5191                     &as->arcstat_mfu_evictable_data,
5192                     &as->arcstat_mfu_evictable_metadata);
5193                 arc_kstat_update_state(arc_mfu_ghost,
5194                     &as->arcstat_mfu_ghost_size,
5195                     &as->arcstat_mfu_ghost_evictable_data,
5196                     &as->arcstat_mfu_ghost_evictable_metadata);
5197         }
5198
5199         return (0);
5200 }
5201
5202 /*
5203  * This function *must* return indices evenly distributed between all
5204  * sublists of the multilist. This is needed due to how the ARC eviction
5205  * code is laid out; arc_evict_state() assumes ARC buffers are evenly
5206  * distributed between all sublists and uses this assumption when
5207  * deciding which sublist to evict from and how much to evict from it.
5208  */
5209 unsigned int
5210 arc_state_multilist_index_func(multilist_t *ml, void *obj)
5211 {
5212         arc_buf_hdr_t *hdr = obj;
5213
5214         /*
5215          * We rely on b_dva to generate evenly distributed index
5216          * numbers using buf_hash below. So, as an added precaution,
5217          * let's make sure we never add empty buffers to the arc lists.
5218          */
5219         ASSERT(!BUF_EMPTY(hdr));
5220
5221         /*
5222          * The assumption here, is the hash value for a given
5223          * arc_buf_hdr_t will remain constant throughout its lifetime
5224          * (i.e. its b_spa, b_dva, and b_birth fields don't change).
5225          * Thus, we don't need to store the header's sublist index
5226          * on insertion, as this index can be recalculated on removal.
5227          *
5228          * Also, the low order bits of the hash value are thought to be
5229          * distributed evenly. Otherwise, in the case that the multilist
5230          * has a power of two number of sublists, each sublists' usage
5231          * would not be evenly distributed.
5232          */
5233         return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5234             multilist_get_num_sublists(ml));
5235 }
5236
5237 /*
5238  * Called during module initialization and periodically thereafter to
5239  * apply reasonable changes to the exposed performance tunings.  Non-zero
5240  * zfs_* values which differ from the currently set values will be applied.
5241  */
5242 static void
5243 arc_tuning_update(void)
5244 {
5245         /* Valid range: 64M - <all physical memory> */
5246         if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
5247             (zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) &&
5248             (zfs_arc_max > arc_c_min)) {
5249                 arc_c_max = zfs_arc_max;
5250                 arc_c = arc_c_max;
5251                 arc_p = (arc_c >> 1);
5252                 arc_meta_limit = MIN(arc_meta_limit, arc_c_max);
5253         }
5254
5255         /* Valid range: 32M - <arc_c_max> */
5256         if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
5257             (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
5258             (zfs_arc_min <= arc_c_max)) {
5259                 arc_c_min = zfs_arc_min;
5260                 arc_c = MAX(arc_c, arc_c_min);
5261         }
5262
5263         /* Valid range: 16M - <arc_c_max> */
5264         if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
5265             (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
5266             (zfs_arc_meta_min <= arc_c_max)) {
5267                 arc_meta_min = zfs_arc_meta_min;
5268                 arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
5269         }
5270
5271         /* Valid range: <arc_meta_min> - <arc_c_max> */
5272         if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) &&
5273             (zfs_arc_meta_limit >= zfs_arc_meta_min) &&
5274             (zfs_arc_meta_limit <= arc_c_max))
5275                 arc_meta_limit = zfs_arc_meta_limit;
5276
5277         /* Valid range: 1 - N */
5278         if (zfs_arc_grow_retry)
5279                 arc_grow_retry = zfs_arc_grow_retry;
5280
5281         /* Valid range: 1 - N */
5282         if (zfs_arc_shrink_shift) {
5283                 arc_shrink_shift = zfs_arc_shrink_shift;
5284                 arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
5285         }
5286
5287         /* Valid range: 1 - N */
5288         if (zfs_arc_p_min_shift)
5289                 arc_p_min_shift = zfs_arc_p_min_shift;
5290
5291         /* Valid range: 1 - N ticks */
5292         if (zfs_arc_min_prefetch_lifespan)
5293                 arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
5294 }
5295
5296 void
5297 arc_init(void)
5298 {
5299         /*
5300          * allmem is "all memory that we could possibly use".
5301          */
5302 #ifdef _KERNEL
5303         uint64_t allmem = ptob(physmem);
5304 #else
5305         uint64_t allmem = (physmem * PAGESIZE) / 2;
5306 #endif
5307
5308         mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
5309         cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
5310         cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
5311
5312         mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5313         cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
5314
5315         /* Convert seconds to clock ticks */
5316         arc_min_prefetch_lifespan = 1 * hz;
5317
5318         /* Start out with 1/8 of all memory */
5319         arc_c = allmem / 8;
5320
5321 #ifdef _KERNEL
5322         /*
5323          * On architectures where the physical memory can be larger
5324          * than the addressable space (intel in 32-bit mode), we may
5325          * need to limit the cache to 1/8 of VM size.
5326          */
5327         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
5328
5329         /*
5330          * Register a shrinker to support synchronous (direct) memory
5331          * reclaim from the arc.  This is done to prevent kswapd from
5332          * swapping out pages when it is preferable to shrink the arc.
5333          */
5334         spl_register_shrinker(&arc_shrinker);
5335 #endif
5336
5337         /* Set min cache to allow safe operation of arc_adapt() */
5338         arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT;
5339         /* Set max to 1/2 of all memory */
5340         arc_c_max = allmem / 2;
5341
5342         arc_c = arc_c_max;
5343         arc_p = (arc_c >> 1);
5344
5345         /* Set min to 1/2 of arc_c_min */
5346         arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
5347         /* Initialize maximum observed usage to zero */
5348         arc_meta_max = 0;
5349         /* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */
5350         arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min);
5351
5352         /* Apply user specified tunings */
5353         arc_tuning_update();
5354
5355         if (zfs_arc_num_sublists_per_state < 1)
5356                 zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
5357
5358         /* if kmem_flags are set, lets try to use less memory */
5359         if (kmem_debugging())
5360                 arc_c = arc_c / 2;
5361         if (arc_c < arc_c_min)
5362                 arc_c = arc_c_min;
5363
5364         arc_anon = &ARC_anon;
5365         arc_mru = &ARC_mru;
5366         arc_mru_ghost = &ARC_mru_ghost;
5367         arc_mfu = &ARC_mfu;
5368         arc_mfu_ghost = &ARC_mfu_ghost;
5369         arc_l2c_only = &ARC_l2c_only;
5370         arc_size = 0;
5371
5372         multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
5373             sizeof (arc_buf_hdr_t),
5374             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5375             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5376         multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
5377             sizeof (arc_buf_hdr_t),
5378             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5379             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5380         multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
5381             sizeof (arc_buf_hdr_t),
5382             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5383             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5384         multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
5385             sizeof (arc_buf_hdr_t),
5386             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5387             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5388         multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
5389             sizeof (arc_buf_hdr_t),
5390             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5391             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5392         multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
5393             sizeof (arc_buf_hdr_t),
5394             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5395             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5396         multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
5397             sizeof (arc_buf_hdr_t),
5398             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5399             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5400         multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
5401             sizeof (arc_buf_hdr_t),
5402             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5403             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5404         multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
5405             sizeof (arc_buf_hdr_t),
5406             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5407             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5408         multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
5409             sizeof (arc_buf_hdr_t),
5410             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5411             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5412
5413         arc_anon->arcs_state = ARC_STATE_ANON;
5414         arc_mru->arcs_state = ARC_STATE_MRU;
5415         arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
5416         arc_mfu->arcs_state = ARC_STATE_MFU;
5417         arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
5418         arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
5419
5420         refcount_create(&arc_anon->arcs_size);
5421         refcount_create(&arc_mru->arcs_size);
5422         refcount_create(&arc_mru_ghost->arcs_size);
5423         refcount_create(&arc_mfu->arcs_size);
5424         refcount_create(&arc_mfu_ghost->arcs_size);
5425         refcount_create(&arc_l2c_only->arcs_size);
5426
5427         buf_init();
5428
5429         arc_reclaim_thread_exit = FALSE;
5430         arc_user_evicts_thread_exit = FALSE;
5431         list_create(&arc_prune_list, sizeof (arc_prune_t),
5432             offsetof(arc_prune_t, p_node));
5433         arc_eviction_list = NULL;
5434         mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
5435         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
5436
5437         arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
5438             max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
5439
5440         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
5441             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
5442
5443         if (arc_ksp != NULL) {
5444                 arc_ksp->ks_data = &arc_stats;
5445                 arc_ksp->ks_update = arc_kstat_update;
5446                 kstat_install(arc_ksp);
5447         }
5448
5449         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
5450             TS_RUN, minclsyspri);
5451
5452         (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
5453             TS_RUN, minclsyspri);
5454
5455         arc_dead = FALSE;
5456         arc_warm = B_FALSE;
5457
5458         /*
5459          * Calculate maximum amount of dirty data per pool.
5460          *
5461          * If it has been set by a module parameter, take that.
5462          * Otherwise, use a percentage of physical memory defined by
5463          * zfs_dirty_data_max_percent (default 10%) with a cap at
5464          * zfs_dirty_data_max_max (default 25% of physical memory).
5465          */
5466         if (zfs_dirty_data_max_max == 0)
5467                 zfs_dirty_data_max_max = physmem * PAGESIZE *
5468                     zfs_dirty_data_max_max_percent / 100;
5469
5470         if (zfs_dirty_data_max == 0) {
5471                 zfs_dirty_data_max = physmem * PAGESIZE *
5472                     zfs_dirty_data_max_percent / 100;
5473                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
5474                     zfs_dirty_data_max_max);
5475         }
5476 }
5477
5478 void
5479 arc_fini(void)
5480 {
5481         arc_prune_t *p;
5482
5483 #ifdef _KERNEL
5484         spl_unregister_shrinker(&arc_shrinker);
5485 #endif /* _KERNEL */
5486
5487         mutex_enter(&arc_reclaim_lock);
5488         arc_reclaim_thread_exit = TRUE;
5489         /*
5490          * The reclaim thread will set arc_reclaim_thread_exit back to
5491          * FALSE when it is finished exiting; we're waiting for that.
5492          */
5493         while (arc_reclaim_thread_exit) {
5494                 cv_signal(&arc_reclaim_thread_cv);
5495                 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
5496         }
5497         mutex_exit(&arc_reclaim_lock);
5498
5499         mutex_enter(&arc_user_evicts_lock);
5500         arc_user_evicts_thread_exit = TRUE;
5501         /*
5502          * The user evicts thread will set arc_user_evicts_thread_exit
5503          * to FALSE when it is finished exiting; we're waiting for that.
5504          */
5505         while (arc_user_evicts_thread_exit) {
5506                 cv_signal(&arc_user_evicts_cv);
5507                 cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5508         }
5509         mutex_exit(&arc_user_evicts_lock);
5510
5511         /* Use TRUE to ensure *all* buffers are evicted */
5512         arc_flush(NULL, TRUE);
5513
5514         arc_dead = TRUE;
5515
5516         if (arc_ksp != NULL) {
5517                 kstat_delete(arc_ksp);
5518                 arc_ksp = NULL;
5519         }
5520
5521         taskq_wait(arc_prune_taskq);
5522         taskq_destroy(arc_prune_taskq);
5523
5524         mutex_enter(&arc_prune_mtx);
5525         while ((p = list_head(&arc_prune_list)) != NULL) {
5526                 list_remove(&arc_prune_list, p);
5527                 refcount_remove(&p->p_refcnt, &arc_prune_list);
5528                 refcount_destroy(&p->p_refcnt);
5529                 kmem_free(p, sizeof (*p));
5530         }
5531         mutex_exit(&arc_prune_mtx);
5532
5533         list_destroy(&arc_prune_list);
5534         mutex_destroy(&arc_prune_mtx);
5535         mutex_destroy(&arc_reclaim_lock);
5536         cv_destroy(&arc_reclaim_thread_cv);
5537         cv_destroy(&arc_reclaim_waiters_cv);
5538
5539         mutex_destroy(&arc_user_evicts_lock);
5540         cv_destroy(&arc_user_evicts_cv);
5541
5542         refcount_destroy(&arc_anon->arcs_size);
5543         refcount_destroy(&arc_mru->arcs_size);
5544         refcount_destroy(&arc_mru_ghost->arcs_size);
5545         refcount_destroy(&arc_mfu->arcs_size);
5546         refcount_destroy(&arc_mfu_ghost->arcs_size);
5547         refcount_destroy(&arc_l2c_only->arcs_size);
5548
5549         multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
5550         multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
5551         multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
5552         multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
5553         multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
5554         multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
5555         multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
5556         multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
5557         multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
5558         multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
5559
5560         buf_fini();
5561
5562         ASSERT0(arc_loaned_bytes);
5563 }
5564
5565 /*
5566  * Level 2 ARC
5567  *
5568  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5569  * It uses dedicated storage devices to hold cached data, which are populated
5570  * using large infrequent writes.  The main role of this cache is to boost
5571  * the performance of random read workloads.  The intended L2ARC devices
5572  * include short-stroked disks, solid state disks, and other media with
5573  * substantially faster read latency than disk.
5574  *
5575  *                 +-----------------------+
5576  *                 |         ARC           |
5577  *                 +-----------------------+
5578  *                    |         ^     ^
5579  *                    |         |     |
5580  *      l2arc_feed_thread()    arc_read()
5581  *                    |         |     |
5582  *                    |  l2arc read   |
5583  *                    V         |     |
5584  *               +---------------+    |
5585  *               |     L2ARC     |    |
5586  *               +---------------+    |
5587  *                   |    ^           |
5588  *          l2arc_write() |           |
5589  *                   |    |           |
5590  *                   V    |           |
5591  *                 +-------+      +-------+
5592  *                 | vdev  |      | vdev  |
5593  *                 | cache |      | cache |
5594  *                 +-------+      +-------+
5595  *                 +=========+     .-----.
5596  *                 :  L2ARC  :    |-_____-|
5597  *                 : devices :    | Disks |
5598  *                 +=========+    `-_____-'
5599  *
5600  * Read requests are satisfied from the following sources, in order:
5601  *
5602  *      1) ARC
5603  *      2) vdev cache of L2ARC devices
5604  *      3) L2ARC devices
5605  *      4) vdev cache of disks
5606  *      5) disks
5607  *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
 * the L2ARC and a traditional cache design:
5611  *
5612  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
5613  * the ARC behave as usual, freeing buffers and placing headers on ghost
5614  * lists.  The ARC does not send buffers to the L2ARC during eviction as
5615  * this would add inflated write latencies for all ARC memory pressure.
5616  *
5617  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5618  * It does this by periodically scanning buffers from the eviction-end of
5619  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5620  * not already there. It scans until a headroom of buffers is satisfied,
5621  * which itself is a buffer for ARC eviction. If a compressible buffer is
5622  * found during scanning and selected for writing to an L2ARC device, we
5623  * temporarily boost scanning headroom during the next scan cycle to make
5624  * sure we adapt to compression effects (which might significantly reduce
5625  * the data volume we write to L2ARC). The thread that does this is
5626  * l2arc_feed_thread(), illustrated below; example sizes are included to
5627  * provide a better sense of ratio than this diagram:
5628  *
5629  *             head -->                        tail
5630  *              +---------------------+----------+
5631  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
5632  *              +---------------------+----------+   |   o L2ARC eligible
5633  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
5634  *              +---------------------+----------+   |
5635  *                   15.9 Gbytes      ^ 32 Mbytes    |
5636  *                                 headroom          |
5637  *                                            l2arc_feed_thread()
5638  *                                                   |
5639  *                       l2arc write hand <--[oooo]--'
5640  *                               |           8 Mbyte
5641  *                               |          write max
5642  *                               V
5643  *                +==============================+
5644  *      L2ARC dev |####|#|###|###|    |####| ... |
5645  *                +==============================+
5646  *                           32 Gbytes
5647  *
5648  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5649  * evicted, then the L2ARC has cached a buffer much sooner than it probably
5650  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
5651  * safe to say that this is an uncommon case, since buffers at the end of
5652  * the ARC lists have moved there due to inactivity.
5653  *
5654  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5655  * then the L2ARC simply misses copying some buffers.  This serves as a
5656  * pressure valve to prevent heavy read workloads from both stalling the ARC
5657  * with waits and clogging the L2ARC with writes.  This also helps prevent
5658  * the potential for the L2ARC to churn if it attempts to cache content too
5659  * quickly, such as during backups of the entire pool.
5660  *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from the tail of
 * these lists as pictured, the l2arc_feed_thread() will search from the list
 * heads for eligible buffers, greatly increasing its chance of finding them.
5666  *
5667  * The L2ARC device write speed is also boosted during this time so that
5668  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
5669  * there are no L2ARC reads, and no fear of degrading read performance
5670  * through increased writes.
5671  *
5672  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5673  * the vdev queue can aggregate them into larger and fewer writes.  Each
5674  * device is written to in a rotor fashion, sweeping writes through
5675  * available space then repeating.
5676  *
5677  * 7. The L2ARC does not store dirty content.  It never needs to flush
5678  * write buffers back to disk based storage.
5679  *
5680  * 8. If an ARC buffer is written (and dirtied) which also exists in the
5681  * L2ARC, the now stale L2ARC buffer is immediately dropped.
5682  *
5683  * The performance of the L2ARC can be tweaked by a number of tunables, which
5684  * may be necessary for different workloads:
5685  *
5686  *      l2arc_write_max         max write bytes per interval
5687  *      l2arc_write_boost       extra write bytes during device warmup
5688  *      l2arc_noprefetch        skip caching prefetched buffers
5689  *      l2arc_nocompress        skip compressing buffers
5690  *      l2arc_headroom          number of max device writes to precache
5691  *      l2arc_headroom_boost    when we find compressed buffers during ARC
5692  *                              scanning, we multiply headroom by this
5693  *                              percentage factor for the next scan cycle,
5694  *                              since more compressed buffers are likely to
5695  *                              be present
5696  *      l2arc_feed_secs         seconds between L2ARC writing
5697  *
5698  * Tunables may be removed or added as future performance improvements are
5699  * integrated, and also may become zpool properties.
5700  *
5701  * There are three key functions that control how the L2ARC warms up:
5702  *
5703  *      l2arc_write_eligible()  check if a buffer is eligible to cache
5704  *      l2arc_write_size()      calculate how much to write
5705  *      l2arc_write_interval()  calculate sleep delay between writes
5706  *
5707  * These three functions determine what to write, how much, and how quickly
5708  * to send writes.
5709  */
5710
5711 static boolean_t
5712 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
5713 {
5714         /*
5715          * A buffer is *not* eligible for the L2ARC if it:
5716          * 1. belongs to a different spa.
5717          * 2. is already cached on the L2ARC.
5718          * 3. has an I/O in progress (it may be an incomplete read).
5719          * 4. is flagged not eligible (zfs property).
5720          */
5721         if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
5722             HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
5723                 return (B_FALSE);
5724
5725         return (B_TRUE);
5726 }
5727
5728 static uint64_t
5729 l2arc_write_size(void)
5730 {
5731         uint64_t size;
5732
5733         /*
5734          * Make sure our globals have meaningful values in case the user
5735          * altered them.
5736          */
5737         size = l2arc_write_max;
5738         if (size == 0) {
5739                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5740                     "be greater than zero, resetting it to the default (%d)",
5741                     L2ARC_WRITE_SIZE);
5742                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
5743         }
5744
5745         if (arc_warm == B_FALSE)
5746                 size += l2arc_write_boost;
5747
5748         return (size);
5749
5750 }
5751
5752 static clock_t
5753 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5754 {
5755         clock_t interval, next, now;
5756
5757         /*
5758          * If the ARC lists are busy, increase our write rate; if the
5759          * lists are stale, idle back.  This is achieved by checking
5760          * how much we previously wrote - if it was more than half of
5761          * what we wanted, schedule the next write much sooner.
5762          */
5763         if (l2arc_feed_again && wrote > (wanted / 2))
5764                 interval = (hz * l2arc_feed_min_ms) / 1000;
5765         else
5766                 interval = hz * l2arc_feed_secs;
5767
5768         now = ddi_get_lbolt();
5769         next = MAX(now, MIN(now + interval, began + interval));
5770
5771         return (next);
5772 }
5773
5774 /*
5775  * Cycle through L2ARC devices.  This is how L2ARC load balances.
5776  * If a device is returned, this also returns holding the spa config lock.
5777  */
5778 static l2arc_dev_t *
5779 l2arc_dev_get_next(void)
5780 {
5781         l2arc_dev_t *first, *next = NULL;
5782
5783         /*
5784          * Lock out the removal of spas (spa_namespace_lock), then removal
5785          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5786          * both locks will be dropped and a spa config lock held instead.
5787          */
5788         mutex_enter(&spa_namespace_lock);
5789         mutex_enter(&l2arc_dev_mtx);
5790
5791         /* if there are no vdevs, there is nothing to do */
5792         if (l2arc_ndev == 0)
5793                 goto out;
5794
5795         first = NULL;
5796         next = l2arc_dev_last;
5797         do {
5798                 /* loop around the list looking for a non-faulted vdev */
5799                 if (next == NULL) {
5800                         next = list_head(l2arc_dev_list);
5801                 } else {
5802                         next = list_next(l2arc_dev_list, next);
5803                         if (next == NULL)
5804                                 next = list_head(l2arc_dev_list);
5805                 }
5806
5807                 /* if we have come back to the start, bail out */
5808                 if (first == NULL)
5809                         first = next;
5810                 else if (next == first)
5811                         break;
5812
5813         } while (vdev_is_dead(next->l2ad_vdev));
5814
5815         /* if we were unable to find any usable vdevs, return NULL */
5816         if (vdev_is_dead(next->l2ad_vdev))
5817                 next = NULL;
5818
5819         l2arc_dev_last = next;
5820
5821 out:
5822         mutex_exit(&l2arc_dev_mtx);
5823
5824         /*
5825          * Grab the config lock to prevent the 'next' device from being
5826          * removed while we are writing to it.
5827          */
5828         if (next != NULL)
5829                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5830         mutex_exit(&spa_namespace_lock);
5831
5832         return (next);
5833 }
5834
5835 /*
5836  * Free buffers that were tagged for destruction.
5837  */
5838 static void
5839 l2arc_do_free_on_write(void)
5840 {
5841         list_t *buflist;
5842         l2arc_data_free_t *df, *df_prev;
5843
5844         mutex_enter(&l2arc_free_on_write_mtx);
5845         buflist = l2arc_free_on_write;
5846
5847         for (df = list_tail(buflist); df; df = df_prev) {
5848                 df_prev = list_prev(buflist, df);
5849                 ASSERT(df->l2df_data != NULL);
5850                 ASSERT(df->l2df_func != NULL);
5851                 df->l2df_func(df->l2df_data, df->l2df_size);
5852                 list_remove(buflist, df);
5853                 kmem_free(df, sizeof (l2arc_data_free_t));
5854         }
5855
5856         mutex_exit(&l2arc_free_on_write_mtx);
5857 }
5858
5859 /*
5860  * A write to a cache device has completed.  Update all headers to allow
5861  * reads from these buffers to begin.
5862  */
5863 static void
5864 l2arc_write_done(zio_t *zio)
5865 {
5866         l2arc_write_callback_t *cb;
5867         l2arc_dev_t *dev;
5868         list_t *buflist;
5869         arc_buf_hdr_t *head, *hdr, *hdr_prev;
5870         kmutex_t *hash_lock;
5871         int64_t bytes_dropped = 0;
5872
5873         cb = zio->io_private;
5874         ASSERT(cb != NULL);
5875         dev = cb->l2wcb_dev;
5876         ASSERT(dev != NULL);
5877         head = cb->l2wcb_head;
5878         ASSERT(head != NULL);
5879         buflist = &dev->l2ad_buflist;
5880         ASSERT(buflist != NULL);
5881         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5882             l2arc_write_callback_t *, cb);
5883
5884         if (zio->io_error != 0)
5885                 ARCSTAT_BUMP(arcstat_l2_writes_error);
5886
5887         /*
5888          * All writes completed, or an error was hit.
5889          */
5890 top:
5891         mutex_enter(&dev->l2ad_mtx);
5892         for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5893                 hdr_prev = list_prev(buflist, hdr);
5894
5895                 hash_lock = HDR_LOCK(hdr);
5896
5897                 /*
5898                  * We cannot use mutex_enter or else we can deadlock
5899                  * with l2arc_write_buffers (due to swapping the order
5900                  * the hash lock and l2ad_mtx are taken).
5901                  */
5902                 if (!mutex_tryenter(hash_lock)) {
5903                         /*
5904                          * Missed the hash lock. We must retry so we
5905                          * don't leave the ARC_FLAG_L2_WRITING bit set.
5906                          */
5907                         ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
5908
5909                         /*
5910                          * We don't want to rescan the headers we've
5911                          * already marked as having been written out, so
5912                          * we reinsert the head node so we can pick up
5913                          * where we left off.
5914                          */
5915                         list_remove(buflist, head);
5916                         list_insert_after(buflist, hdr, head);
5917
5918                         mutex_exit(&dev->l2ad_mtx);
5919
5920                         /*
5921                          * We wait for the hash lock to become available
5922                          * to try and prevent busy waiting, and increase
5923                          * the chance we'll be able to acquire the lock
5924                          * the next time around.
5925                          */
5926                         mutex_enter(hash_lock);
5927                         mutex_exit(hash_lock);
5928                         goto top;
5929                 }
5930
5931                 /*
5932                  * We could not have been moved into the arc_l2c_only
5933                  * state while in-flight due to our ARC_FLAG_L2_WRITING
5934                  * bit being set. Let's just ensure that's being enforced.
5935                  */
5936                 ASSERT(HDR_HAS_L1HDR(hdr));
5937
5938                 /*
5939                  * We may have allocated a buffer for L2ARC compression,
5940                  * we must release it to avoid leaking this data.
5941                  */
5942                 l2arc_release_cdata_buf(hdr);
5943
5944                 if (zio->io_error != 0) {
5945                         /*
5946                          * Error - drop L2ARC entry.
5947                          */
5948                         list_remove(buflist, hdr);
5949                         hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5950
5951                         ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5952                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5953
5954                         bytes_dropped += hdr->b_l2hdr.b_asize;
5955                         (void) refcount_remove_many(&dev->l2ad_alloc,
5956                             hdr->b_l2hdr.b_asize, hdr);
5957                 }
5958
5959                 /*
5960                  * Allow ARC to begin reads and ghost list evictions to
5961                  * this L2ARC entry.
5962                  */
5963                 hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5964
5965                 mutex_exit(hash_lock);
5966         }
5967
5968         atomic_inc_64(&l2arc_writes_done);
5969         list_remove(buflist, head);
5970         ASSERT(!HDR_HAS_L1HDR(head));
5971         kmem_cache_free(hdr_l2only_cache, head);
5972         mutex_exit(&dev->l2ad_mtx);
5973
5974         vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5975
5976         l2arc_do_free_on_write();
5977
5978         kmem_free(cb, sizeof (l2arc_write_callback_t));
5979 }
5980
5981 /*
5982  * A read to a cache device completed.  Validate buffer contents before
5983  * handing over to the regular ARC routines.
5984  */
5985 static void
5986 l2arc_read_done(zio_t *zio)
5987 {
5988         l2arc_read_callback_t *cb;
5989         arc_buf_hdr_t *hdr;
5990         arc_buf_t *buf;
5991         kmutex_t *hash_lock;
5992         int equal;
5993
5994         ASSERT(zio->io_vd != NULL);
5995         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5996
5997         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5998
5999         cb = zio->io_private;
6000         ASSERT(cb != NULL);
6001         buf = cb->l2rcb_buf;
6002         ASSERT(buf != NULL);
6003
6004         hash_lock = HDR_LOCK(buf->b_hdr);
6005         mutex_enter(hash_lock);
6006         hdr = buf->b_hdr;
6007         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6008
6009         /*
6010          * If the buffer was compressed, decompress it first.
6011          */
6012         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
6013                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
6014         ASSERT(zio->io_data != NULL);
6015
6016         /*
6017          * Check this survived the L2ARC journey.
6018          */
6019         equal = arc_cksum_equal(buf);
6020         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
6021                 mutex_exit(hash_lock);
6022                 zio->io_private = buf;
6023                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
6024                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
6025                 arc_read_done(zio);
6026         } else {
6027                 mutex_exit(hash_lock);
6028                 /*
6029                  * Buffer didn't survive caching.  Increment stats and
6030                  * reissue to the original storage device.
6031                  */
6032                 if (zio->io_error != 0) {
6033                         ARCSTAT_BUMP(arcstat_l2_io_error);
6034                 } else {
6035                         zio->io_error = SET_ERROR(EIO);
6036                 }
6037                 if (!equal)
6038                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
6039
6040                 /*
6041                  * If there's no waiter, issue an async i/o to the primary
6042                  * storage now.  If there *is* a waiter, the caller must
6043                  * issue the i/o in a context where it's OK to block.
6044                  */
6045                 if (zio->io_waiter == NULL) {
6046                         zio_t *pio = zio_unique_parent(zio);
6047
6048                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
6049
6050                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
6051                             buf->b_data, zio->io_size, arc_read_done, buf,
6052                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
6053                 }
6054         }
6055
6056         kmem_free(cb, sizeof (l2arc_read_callback_t));
6057 }
6058
6059 /*
6060  * This is the list priority from which the L2ARC will search for pages to
6061  * cache.  This is used within loops (0..3) to cycle through lists in the
6062  * desired order.  This order can have a significant effect on cache
6063  * performance.
6064  *
6065  * Currently the metadata lists are hit first, MFU then MRU, followed by
6066  * the data lists.  This function returns a locked list, and also returns
6067  * the lock pointer.
6068  */
6069 static multilist_sublist_t *
6070 l2arc_sublist_lock(int list_num)
6071 {
6072         multilist_t *ml = NULL;
6073         unsigned int idx;
6074
6075         ASSERT(list_num >= 0 && list_num <= 3);
6076
6077         switch (list_num) {
6078         case 0:
6079                 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
6080                 break;
6081         case 1:
6082                 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
6083                 break;
6084         case 2:
6085                 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
6086                 break;
6087         case 3:
6088                 ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
6089                 break;
6090         }
6091
6092         /*
6093          * Return a randomly-selected sublist. This is acceptable
6094          * because the caller feeds only a little bit of data for each
6095          * call (8MB). Subsequent calls will result in different
6096          * sublists being selected.
6097          */
6098         idx = multilist_get_random_index(ml);
6099         return (multilist_sublist_lock(ml, idx));
6100 }
6101
6102 /*
6103  * Evict buffers from the device write hand to the distance specified in
6104  * bytes.  This distance may span populated buffers, it may span nothing.
6105  * This is clearing a region on the L2ARC device ready for writing.
6106  * If the 'all' boolean is set, every buffer is evicted.
      *
      * dev      - cache device whose l2ad_buflist is walked, tail first.
      * distance - number of bytes ahead of dev->l2ad_hand to clear.  When
      *            the hand is within 2 * distance of dev->l2ad_end the
      *            target becomes l2ad_end instead.
      * all      - when set, the "first sweep" early return and the
      *            address-range cut-off below are both bypassed.
      *
      * Locking: dev->l2ad_mtx is held while the list is walked.  Each
      * header's hash lock is taken with mutex_tryenter(); on failure
      * l2ad_mtx is dropped, the hash lock is waited for and released, and
      * the walk restarts from the tail of the list.
6107  */
6108 static void
6109 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
6110 {
6111         list_t *buflist;
6112         arc_buf_hdr_t *hdr, *hdr_prev;
6113         kmutex_t *hash_lock;
6114         uint64_t taddr;
6115
6116         buflist = &dev->l2ad_buflist;
6117
6118         if (!all && dev->l2ad_first) {
6119                 /*
6120                  * This is the first sweep through the device.  There is
6121                  * nothing to evict.
6122                  */
6123                 return;
6124         }
6125
6126         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
6127                 /*
6128                  * When nearing the end of the device, evict to the end
6129                  * before the device write hand jumps to the start.
6130                  */
6131                 taddr = dev->l2ad_end;
6132         } else {
6133                 taddr = dev->l2ad_hand + distance;
6134         }
6135         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
6136             uint64_t, taddr, boolean_t, all);
6137
6138 top:
6139         mutex_enter(&dev->l2ad_mtx);
6140         for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
                     /*
                      * Remember the predecessor now: hdr may be unlinked
                      * from the list or destroyed before the loop advances.
                      */
6141                 hdr_prev = list_prev(buflist, hdr);
6142
6143                 hash_lock = HDR_LOCK(hdr);
6144
6145                 /*
6146                  * We cannot use mutex_enter or else we can deadlock
6147                  * with l2arc_write_buffers (due to swapping the order
6148                  * the hash lock and l2ad_mtx are taken).
6149                  */
6150                 if (!mutex_tryenter(hash_lock)) {
6151                         /*
6152                          * Missed the hash lock.  Retry.
                              * l2ad_mtx is dropped first, then we block on
                              * the hash lock and release it straight away
                              * before rescanning from the tail.
6153                          */
6154                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
6155                         mutex_exit(&dev->l2ad_mtx);
6156                         mutex_enter(hash_lock);
6157                         mutex_exit(hash_lock);
6158                         goto top;
6159                 }
6160
6161                 if (HDR_L2_WRITE_HEAD(hdr)) {
6162                         /*
6163                          * We hit a write head node.  Leave it for
6164                          * l2arc_write_done().
6165                          */
6166                         list_remove(buflist, hdr);
6167                         mutex_exit(hash_lock);
6168                         continue;
6169                 }
6170
6171                 if (!all && HDR_HAS_L2HDR(hdr) &&
6172                     (hdr->b_l2hdr.b_daddr > taddr ||
6173                     hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
6174                         /*
6175                          * We've evicted to the target address,
6176                          * or the end of the device.
6177                          */
6178                         mutex_exit(hash_lock);
6179                         break;
6180                 }
6181
6182                 ASSERT(HDR_HAS_L2HDR(hdr));
6183                 if (!HDR_HAS_L1HDR(hdr)) {
6184                         ASSERT(!HDR_L2_READING(hdr));
6185                         /*
6186                          * This doesn't exist in the ARC.  Destroy.
6187                          * arc_hdr_destroy() will call list_remove()
6188                          * and decrement arcstat_l2_size.
6189                          */
6190                         arc_change_state(arc_anon, hdr, hash_lock);
6191                         arc_hdr_destroy(hdr);
6192                 } else {
6193                         ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
6194                         ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
6195                         /*
6196                          * Invalidate issued or about to be issued
6197                          * reads, since we may be about to write
6198                          * over this location.
6199                          */
6200                         if (HDR_L2_READING(hdr)) {
6201                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
6202                                 hdr->b_flags |= ARC_FLAG_L2_EVICTED;
6203                         }
6204
6205                         /* Ensure this header has finished being written */
6206                         ASSERT(!HDR_L2_WRITING(hdr));
6207                         ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6208
                             /*
                              * The header stays in the ARC; only its L2
                              * portion is torn down.
                              */
6209                         arc_hdr_l2hdr_destroy(hdr);
6210                 }
6211                 mutex_exit(hash_lock);
6212         }
6213         mutex_exit(&dev->l2ad_mtx);
6214 }
6215
6216 /*
6217  * Find and write ARC buffers to the L2ARC device.
6218  *
6219  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
6220  * for reading until they have completed writing.
6221  * The headroom_boost is an in-out parameter used to maintain headroom boost
6222  * state between calls to this function.
6223  *
6224  * Returns the number of bytes actually written (which may be smaller than
6225  * the delta by which the device hand has changed due to alignment).
      *
      * The work is done in two passes.  The first pass walks up to four ARC
      * sublists (see l2arc_sublist_lock()), flags each eligible header with
      * ARC_FLAG_L2_WRITING and inserts it at the head of dev->l2ad_buflist,
      * in front of a dummy "write head" header.  The second pass, under
      * dev->l2ad_mtx, walks the headers in front of the write head,
      * compresses them where requested, assigns each its device address
      * and issues the physical writes as children of one root zio.  The
      * function then blocks in zio_wait() on that root zio.
      *
      * spa            - pool being fed; its load guid is handed to
      *                  l2arc_write_eligible() and it owns the root zio.
      * dev            - cache device to write to.
      * target_sz      - upper bound on the device-aligned bytes selected
      *                  in the first pass; also scales the scan headroom.
      * headroom_boost - read on entry to decide whether the headroom is
      *                  scaled by l2arc_headroom_boost percent, then reset
      *                  to B_FALSE and raised again if any buffer
      *                  compressed successfully.
      *
      * Returns 0, without issuing any I/O, when no buffer was selected.
6226  */
6227 static uint64_t
6228 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
6229     boolean_t *headroom_boost)
6230 {
6231         arc_buf_hdr_t *hdr, *hdr_prev, *head;
6232         uint64_t write_asize, write_sz, headroom, buf_compress_minsz,
6233             stats_size;
6234         void *buf_data;
6235         boolean_t full;
6236         l2arc_write_callback_t *cb;
6237         zio_t *pio, *wzio;
6238         uint64_t guid = spa_load_guid(spa);
6239         int try;
6240         const boolean_t do_headroom_boost = *headroom_boost;
6241
6242         ASSERT(dev->l2ad_vdev != NULL);
6243
6244         /* Lower the flag now, we might want to raise it again later. */
6245         *headroom_boost = B_FALSE;
6246
6247         pio = NULL;
6248         write_sz = write_asize = 0;
6249         full = B_FALSE;
             /*
              * Allocate the dummy write-head header.  It is only linked
              * into the buflist once the first buffer has been selected.
              */
6250         head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
6251         head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
6252         head->b_flags |= ARC_FLAG_HAS_L2HDR;
6253
6254         /*
6255          * We will want to try to compress buffers that are at least 2x the
6256          * device sector size.
6257          */
6258         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
6259
6260         /*
6261          * Copy buffers for L2ARC writing.
6262          */
6263         for (try = 0; try <= 3; try++) {
6264                 multilist_sublist_t *mls = l2arc_sublist_lock(try);
6265                 uint64_t passed_sz = 0;
6266
6267                 /*
6268                  * L2ARC fast warmup.
6269                  *
6270                  * Until the ARC is warm and starts to evict, read from the
6271                  * head of the ARC lists rather than the tail.
6272                  */
6273                 if (arc_warm == B_FALSE)
6274                         hdr = multilist_sublist_head(mls);
6275                 else
6276                         hdr = multilist_sublist_tail(mls);
6277
6278                 headroom = target_sz * l2arc_headroom;
6279                 if (do_headroom_boost)
6280                         headroom = (headroom * l2arc_headroom_boost) / 100;
6281
6282                 for (; hdr; hdr = hdr_prev) {
6283                         kmutex_t *hash_lock;
6284                         uint64_t buf_sz;
6285                         uint64_t buf_a_sz;
6286
                             /*
                              * Despite its name, hdr_prev is simply the next
                              * header to visit in the chosen scan direction.
                              */
6287                         if (arc_warm == B_FALSE)
6288                                 hdr_prev = multilist_sublist_next(mls, hdr);
6289                         else
6290                                 hdr_prev = multilist_sublist_prev(mls, hdr);
6291
6292                         hash_lock = HDR_LOCK(hdr);
6293                         if (!mutex_tryenter(hash_lock)) {
6294                                 /*
6295                                  * Skip this buffer rather than waiting.
6296                                  */
6297                                 continue;
6298                         }
6299
6300                         passed_sz += hdr->b_size;
6301                         if (passed_sz > headroom) {
6302                                 /*
6303                                  * Searched too far.
6304                                  */
6305                                 mutex_exit(hash_lock);
6306                                 break;
6307                         }
6308
6309                         if (!l2arc_write_eligible(guid, hdr)) {
6310                                 mutex_exit(hash_lock);
6311                                 continue;
6312                         }
6313
6314                         /*
6315                          * Assume that the buffer is not going to be compressed
6316                          * and could take more space on disk because of a larger
6317                          * disk block size.
6318                          */
6319                         buf_sz = hdr->b_size;
6320                         buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6321
6322                         if ((write_asize + buf_a_sz) > target_sz) {
6323                                 full = B_TRUE;
6324                                 mutex_exit(hash_lock);
6325                                 break;
6326                         }
6327
6328                         if (pio == NULL) {
6329                                 /*
6330                                  * Insert a dummy header on the buflist so
6331                                  * l2arc_write_done() can find where the
6332                                  * write buffers begin without searching.
6333                                  */
6334                                 mutex_enter(&dev->l2ad_mtx);
6335                                 list_insert_head(&dev->l2ad_buflist, head);
6336                                 mutex_exit(&dev->l2ad_mtx);
6337
6338                                 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
6339                                     KM_SLEEP);
6340                                 cb->l2wcb_dev = dev;
6341                                 cb->l2wcb_head = head;
6342                                 pio = zio_root(spa, l2arc_write_done, cb,
6343                                     ZIO_FLAG_CANFAIL);
6344                         }
6345
6346                         /*
6347                          * Create and add a new L2ARC header.
6348                          */
6349                         hdr->b_l2hdr.b_dev = dev;
6350                         hdr->b_flags |= ARC_FLAG_L2_WRITING;
6351                         /*
6352                          * Temporarily stash the data buffer in b_tmp_cdata.
6353                          * The subsequent write step will pick it up from
6354                          * there. This is because we can't access b_l1hdr.b_buf
6355                          * without holding the hash_lock, which we in turn
6356                          * can't access without holding the ARC list locks
6357                          * (which we want to avoid during compression/writing)
6358                          */
6359                         HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
6360                         hdr->b_l2hdr.b_asize = hdr->b_size;
6361                         hdr->b_l2hdr.b_hits = 0;
6362                         hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
6363
6364                         /*
6365                          * Explicitly set the b_daddr field to a known
6366                          * value which means "invalid address". This
6367                          * enables us to differentiate which stage of
6368                          * l2arc_write_buffers() the particular header
6369                          * is in (e.g. this loop, or the one below).
6370                          * ARC_FLAG_L2_WRITING is not enough to make
6371                          * this distinction, and we need to know in
6372                          * order to do proper l2arc vdev accounting in
6373                          * arc_release() and arc_hdr_destroy().
6374                          *
6375                          * Note, we can't use a new flag to distinguish
6376                          * the two stages because we don't hold the
6377                          * header's hash_lock below, in the second stage
6378                          * of this function. Thus, we can't simply
6379                          * change the b_flags field to denote that the
6380                          * IO has been sent. We can change the b_daddr
6381                          * field of the L2 portion, though, since we'll
6382                          * be holding the l2ad_mtx; which is why we're
6383                          * using it to denote the header's state change.
6384                          */
6385                         hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
6386                         hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
6387
6388                         mutex_enter(&dev->l2ad_mtx);
6389                         list_insert_head(&dev->l2ad_buflist, hdr);
6390                         mutex_exit(&dev->l2ad_mtx);
6391
6392                         /*
6393                          * Compute and store the buffer cksum before
6394                          * writing.  On debug the cksum is verified first.
6395                          */
6396                         arc_cksum_verify(hdr->b_l1hdr.b_buf);
6397                         arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
6398
6399                         mutex_exit(hash_lock);
6400
6401                         write_sz += buf_sz;
6402                         write_asize += buf_a_sz;
6403                 }
6404
6405                 multilist_sublist_unlock(mls);
6406
                     /* The write target was reached; skip the other lists. */
6407                 if (full == B_TRUE)
6408                         break;
6409         }
6410
6411         /* No buffers selected for writing? */
6412         if (pio == NULL) {
6413                 ASSERT0(write_sz);
6414                 ASSERT(!HDR_HAS_L1HDR(head));
6415                 kmem_cache_free(hdr_l2only_cache, head);
6416                 return (0);
6417         }
6418
6419         mutex_enter(&dev->l2ad_mtx);
6420
6421         /*
6422          * Note that elsewhere in this file arcstat_l2_asize
6423          * and the used space on l2ad_vdev are updated using b_asize,
6424          * which is not necessarily rounded up to the device block size.
6425          * To keep accounting consistent we do the same here as well:
6426          * stats_size accumulates the sum of b_asize of the written buffers,
6427          * while write_asize accumulates the sum of b_asize rounded up
6428          * to the device block size.
6429          * The latter sum is used only to validate the correctness of the code.
              * write_asize is restarted from zero here and rebuilt from the
              * sizes actually issued in the loop below.
6430          */
6431         stats_size = 0;
6432         write_asize = 0;
6433
6434         /*
6435          * Now start writing the buffers. We're starting at the write head
6436          * and work backwards, retracing the course of the buffer selector
6437          * loop above.
6438          */
6439         for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6440             hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6441                 uint64_t buf_sz;
6442
6443                 /*
6444                  * We rely on the L1 portion of the header below, so
6445                  * it's invalid for this header to have been evicted out
6446                  * of the ghost cache, prior to being written out. The
6447                  * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6448                  */
6449                 ASSERT(HDR_HAS_L1HDR(hdr));
6450
6451                 /*
6452                  * We shouldn't need to lock the buffer here, since we flagged
6453                  * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6454                  * take care to only access its L2 cache parameters. In
6455                  * particular, hdr->l1hdr.b_buf may be invalid by now due to
6456                  * ARC eviction.
6457                  */
6458                 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
6459
6460                 if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
6461                     hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
6462                         if (l2arc_compress_buf(hdr)) {
6463                                 /*
6464                                  * If compression succeeded, enable headroom
6465                                  * boost on the next scan cycle.
6466                                  */
6467                                 *headroom_boost = B_TRUE;
6468                         }
6469                 }
6470
6471                 /*
6472                  * Pick up the buffer data we had previously stashed away
6473                  * (and now potentially also compressed).
6474                  */
6475                 buf_data = hdr->b_l1hdr.b_tmp_cdata;
6476                 buf_sz = hdr->b_l2hdr.b_asize;
6477
6478                 /*
6479                  * We need to do this regardless if buf_sz is zero or
6480                  * not, otherwise, when this l2hdr is evicted we'll
6481                  * remove a reference that was never added.
6482                  */
6483                 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6484
6485                 /* Compression may have squashed the buffer to zero length. */
6486                 if (buf_sz != 0) {
6487                         uint64_t buf_a_sz;
6488
6489                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
6490                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6491                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6492                             ZIO_FLAG_CANFAIL, B_FALSE);
6493
6494                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6495                             zio_t *, wzio);
6496                         (void) zio_nowait(wzio);
6497
6498                         stats_size += buf_sz;
6499
6500                         /*
6501                          * Keep the clock hand suitably device-aligned.
6502                          */
6503                         buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6504                         write_asize += buf_a_sz;
6505                         dev->l2ad_hand += buf_a_sz;
6506                 }
6507         }
6508
6509         mutex_exit(&dev->l2ad_mtx);
6510
6511         ASSERT3U(write_asize, <=, target_sz);
6512         ARCSTAT_BUMP(arcstat_l2_writes_sent);
6513         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
6514         ARCSTAT_INCR(arcstat_l2_size, write_sz);
6515         ARCSTAT_INCR(arcstat_l2_asize, stats_size);
6516         vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
6517
6518         /*
6519          * Bump device hand to the device start if it is approaching the end.
6520          * l2arc_evict() will already have evicted ahead for this case.
6521          */
6522         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
6523                 dev->l2ad_hand = dev->l2ad_start;
6524                 dev->l2ad_first = B_FALSE;
6525         }
6526
             /*
              * l2ad_writing is raised only for the duration of the
              * synchronous wait on the root zio.
              */
6527         dev->l2ad_writing = B_TRUE;
6528         (void) zio_wait(pio);
6529         dev->l2ad_writing = B_FALSE;
6530
6531         return (write_asize);
6532 }
6533
6534 /*
6535  * Compresses an L2ARC buffer.
6536  * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
6537  * size in l2hdr->b_asize. This routine tries to compress the data and
6538  * depending on the compression result there are three possible outcomes:
6539  * *) The buffer was incompressible. The original l2hdr contents were left
6540  *    untouched and are ready for writing to an L2 device.
6541  * *) The buffer was all-zeros, so there is no need to write it to an L2
6542  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
6543  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
6544  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6545  *    data buffer which holds the compressed data to be written, and b_asize
6546  *    tells us how much data there is. b_compress is set to the appropriate
6547  *    compression algorithm. Once writing is done, invoke
6548  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6549  *
6550  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6551  * buffer was incompressible).
6552  */
6553 static boolean_t
6554 l2arc_compress_buf(arc_buf_hdr_t *hdr)
6555 {
6556         void *cdata;
6557         size_t csize, len, rounded;
6558         l2arc_buf_hdr_t *l2hdr;
6559
6560         ASSERT(HDR_HAS_L2HDR(hdr));
6561
6562         l2hdr = &hdr->b_l2hdr;
6563
6564         ASSERT(HDR_HAS_L1HDR(hdr));
6565         ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
6566         ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6567
6568         len = l2hdr->b_asize;
6569         cdata = zio_data_buf_alloc(len);
6570         ASSERT3P(cdata, !=, NULL);
6571         csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6572             cdata, l2hdr->b_asize);
6573
6574         rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6575         if (rounded > csize) {
6576                 bzero((char *)cdata + csize, rounded - csize);
6577                 csize = rounded;
6578         }
6579
6580         if (csize == 0) {
6581                 /* zero block, indicate that there's nothing to write */
6582                 zio_data_buf_free(cdata, len);
6583                 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
6584                 l2hdr->b_asize = 0;
6585                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6586                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6587                 return (B_TRUE);
6588         } else if (csize > 0 && csize < len) {
6589                 /*
6590                  * Compression succeeded, we'll keep the cdata around for
6591                  * writing and release it afterwards.
6592                  */
6593                 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
6594                 l2hdr->b_asize = csize;
6595                 hdr->b_l1hdr.b_tmp_cdata = cdata;
6596                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
6597                 return (B_TRUE);
6598         } else {
6599                 /*
6600                  * Compression failed, release the compressed buffer.
6601                  * l2hdr will be left unmodified.
6602                  */
6603                 zio_data_buf_free(cdata, len);
6604                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
6605                 return (B_FALSE);
6606         }
6607 }
6608
6609 /*
6610  * Decompresses a zio read back from an l2arc device. On success, the
6611  * underlying zio's io_data buffer is overwritten by the uncompressed
6612  * version. On decompression error (corrupt compressed stream), the
6613  * zio->io_error value is set to signal an I/O error.
6614  *
6615  * Please note that the compressed data stream is not checksummed, so
6616  * if the underlying device is experiencing data corruption, we may feed
6617  * corrupt data to the decompressor, so the decompressor needs to be
6618  * able to handle this situation (LZ4 does).
6619  */
6620 static void
6621 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6622 {
6623         uint64_t csize;
6624         void *cdata;
6625
6626         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6627
6628         if (zio->io_error != 0) {
6629                 /*
6630                  * An io error has occured, just restore the original io
6631                  * size in preparation for a main pool read.
6632                  */
6633                 zio->io_orig_size = zio->io_size = hdr->b_size;
6634                 return;
6635         }
6636
6637         if (c == ZIO_COMPRESS_EMPTY) {
6638                 /*
6639                  * An empty buffer results in a null zio, which means we
6640                  * need to fill its io_data after we're done restoring the
6641                  * buffer's contents.
6642                  */
6643                 ASSERT(hdr->b_l1hdr.b_buf != NULL);
6644                 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6645                 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6646         } else {
6647                 ASSERT(zio->io_data != NULL);
6648                 /*
6649                  * We copy the compressed data from the start of the arc buffer
6650                  * (the zio_read will have pulled in only what we need, the
6651                  * rest is garbage which we will overwrite at decompression)
6652                  * and then decompress back to the ARC data buffer. This way we
6653                  * can minimize copying by simply decompressing back over the
6654                  * original compressed data (rather than decompressing to an
6655                  * aux buffer and then copying back the uncompressed buffer,
6656                  * which is likely to be much larger).
6657                  */
6658                 csize = zio->io_size;
6659                 cdata = zio_data_buf_alloc(csize);
6660                 bcopy(zio->io_data, cdata, csize);
6661                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6662                     hdr->b_size) != 0)
6663                         zio->io_error = SET_ERROR(EIO);
6664                 zio_data_buf_free(cdata, csize);
6665         }
6666
6667         /* Restore the expected uncompressed IO size. */
6668         zio->io_orig_size = zio->io_size = hdr->b_size;
6669 }
6670
6671 /*
6672  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6673  * This buffer serves as a temporary holder of compressed data while
6674  * the buffer entry is being written to an l2arc device. Once that is
6675  * done, we can dispose of it.
6676  */
6677 static void
6678 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6679 {
6680         enum zio_compress comp = HDR_GET_COMPRESS(hdr);
6681
6682         ASSERT(HDR_HAS_L1HDR(hdr));
6683         ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6684
6685         if (comp == ZIO_COMPRESS_OFF) {
6686                 /*
6687                  * In this case, b_tmp_cdata points to the same buffer
6688                  * as the arc_buf_t's b_data field. We don't want to
6689                  * free it, since the arc_buf_t will handle that.
6690                  */
6691                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6692         } else if (comp == ZIO_COMPRESS_EMPTY) {
6693                 /*
6694                  * In this case, b_tmp_cdata was compressed to an empty
6695                  * buffer, thus there's nothing to free and b_tmp_cdata
6696                  * should have been set to NULL in l2arc_write_buffers().
6697                  */
6698                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6699         } else {
6700                 /*
6701                  * If the data was compressed, then we've allocated a
6702                  * temporary buffer for it, so now we need to release it.
6703                  */
6704                 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6705                 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6706                     hdr->b_size);
6707                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6708         }
6709
6710 }
6711
6712 /*
6713  * This thread feeds the L2ARC at regular intervals.  This is the beating
6714  * heart of the L2ARC.
6715  */
6716 static void
6717 l2arc_feed_thread(void)
6718 {
6719         callb_cpr_t cpr;
6720         l2arc_dev_t *dev;
6721         spa_t *spa;
6722         uint64_t size, wrote;
6723         clock_t begin, next = ddi_get_lbolt();
6724         boolean_t headroom_boost = B_FALSE;
6725         fstrans_cookie_t cookie;
6726
6727         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6728
6729         mutex_enter(&l2arc_feed_thr_lock);
6730
6731         cookie = spl_fstrans_mark();
6732         while (l2arc_thread_exit == 0) {
6733                 CALLB_CPR_SAFE_BEGIN(&cpr);
6734                 (void) cv_timedwait_sig(&l2arc_feed_thr_cv,
6735                     &l2arc_feed_thr_lock, next);
6736                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
6737                 next = ddi_get_lbolt() + hz;
6738
6739                 /*
6740                  * Quick check for L2ARC devices.
6741                  */
6742                 mutex_enter(&l2arc_dev_mtx);
6743                 if (l2arc_ndev == 0) {
6744                         mutex_exit(&l2arc_dev_mtx);
6745                         continue;
6746                 }
6747                 mutex_exit(&l2arc_dev_mtx);
6748                 begin = ddi_get_lbolt();
6749
6750                 /*
6751                  * This selects the next l2arc device to write to, and in
6752                  * doing so the next spa to feed from: dev->l2ad_spa.   This
6753                  * will return NULL if there are now no l2arc devices or if
6754                  * they are all faulted.
6755                  *
6756                  * If a device is returned, its spa's config lock is also
6757                  * held to prevent device removal.  l2arc_dev_get_next()
6758                  * will grab and release l2arc_dev_mtx.
6759                  */
6760                 if ((dev = l2arc_dev_get_next()) == NULL)
6761                         continue;
6762
6763                 spa = dev->l2ad_spa;
6764                 ASSERT(spa != NULL);
6765
6766                 /*
6767                  * If the pool is read-only then force the feed thread to
6768                  * sleep a little longer.
6769                  */
6770                 if (!spa_writeable(spa)) {
6771                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6772                         spa_config_exit(spa, SCL_L2ARC, dev);
6773                         continue;
6774                 }
6775
6776                 /*
6777                  * Avoid contributing to memory pressure.
6778                  */
6779                 if (arc_reclaim_needed()) {
6780                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6781                         spa_config_exit(spa, SCL_L2ARC, dev);
6782                         continue;
6783                 }
6784
6785                 ARCSTAT_BUMP(arcstat_l2_feeds);
6786
6787                 size = l2arc_write_size();
6788
6789                 /*
6790                  * Evict L2ARC buffers that will be overwritten.
6791                  */
6792                 l2arc_evict(dev, size, B_FALSE);
6793
6794                 /*
6795                  * Write ARC buffers.
6796                  */
6797                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6798
6799                 /*
6800                  * Calculate interval between writes.
6801                  */
6802                 next = l2arc_write_interval(begin, size, wrote);
6803                 spa_config_exit(spa, SCL_L2ARC, dev);
6804         }
6805         spl_fstrans_unmark(cookie);
6806
6807         l2arc_thread_exit = 0;
6808         cv_broadcast(&l2arc_feed_thr_cv);
6809         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
6810         thread_exit();
6811 }
6812
6813 boolean_t
6814 l2arc_vdev_present(vdev_t *vd)
6815 {
6816         l2arc_dev_t *dev;
6817
6818         mutex_enter(&l2arc_dev_mtx);
6819         for (dev = list_head(l2arc_dev_list); dev != NULL;
6820             dev = list_next(l2arc_dev_list, dev)) {
6821                 if (dev->l2ad_vdev == vd)
6822                         break;
6823         }
6824         mutex_exit(&l2arc_dev_mtx);
6825
6826         return (dev != NULL);
6827 }
6828
6829 /*
6830  * Add a vdev for use by the L2ARC.  By this point the spa has already
6831  * validated the vdev and opened it.
6832  */
6833 void
6834 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
6835 {
6836         l2arc_dev_t *adddev;
6837
6838         ASSERT(!l2arc_vdev_present(vd));
6839
6840         /*
6841          * Create a new l2arc device entry.
6842          */
6843         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
6844         adddev->l2ad_spa = spa;
6845         adddev->l2ad_vdev = vd;
6846         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
6847         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
6848         adddev->l2ad_hand = adddev->l2ad_start;
6849         adddev->l2ad_first = B_TRUE;
6850         adddev->l2ad_writing = B_FALSE;
6851         list_link_init(&adddev->l2ad_node);
6852
6853         mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
6854         /*
6855          * This is a list of all ARC buffers that are still valid on the
6856          * device.
6857          */
6858         list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
6859             offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
6860
6861         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
6862         refcount_create(&adddev->l2ad_alloc);
6863
6864         /*
6865          * Add device to global list
6866          */
6867         mutex_enter(&l2arc_dev_mtx);
6868         list_insert_head(l2arc_dev_list, adddev);
6869         atomic_inc_64(&l2arc_ndev);
6870         mutex_exit(&l2arc_dev_mtx);
6871 }
6872
6873 /*
6874  * Remove a vdev from the L2ARC.
6875  */
6876 void
6877 l2arc_remove_vdev(vdev_t *vd)
6878 {
6879         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6880
6881         /*
6882          * Find the device by vdev
6883          */
6884         mutex_enter(&l2arc_dev_mtx);
6885         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6886                 nextdev = list_next(l2arc_dev_list, dev);
6887                 if (vd == dev->l2ad_vdev) {
6888                         remdev = dev;
6889                         break;
6890                 }
6891         }
6892         ASSERT(remdev != NULL);
6893
6894         /*
6895          * Remove device from global list
6896          */
6897         list_remove(l2arc_dev_list, remdev);
6898         l2arc_dev_last = NULL;          /* may have been invalidated */
6899         atomic_dec_64(&l2arc_ndev);
6900         mutex_exit(&l2arc_dev_mtx);
6901
6902         /*
6903          * Clear all buflists and ARC references.  L2ARC device flush.
6904          */
6905         l2arc_evict(remdev, 0, B_TRUE);
6906         list_destroy(&remdev->l2ad_buflist);
6907         mutex_destroy(&remdev->l2ad_mtx);
6908         refcount_destroy(&remdev->l2ad_alloc);
6909         kmem_free(remdev, sizeof (l2arc_dev_t));
6910 }
6911
6912 void
6913 l2arc_init(void)
6914 {
6915         l2arc_thread_exit = 0;
6916         l2arc_ndev = 0;
6917         l2arc_writes_sent = 0;
6918         l2arc_writes_done = 0;
6919
6920         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6921         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6922         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
6923         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6924
6925         l2arc_dev_list = &L2ARC_dev_list;
6926         l2arc_free_on_write = &L2ARC_free_on_write;
6927         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6928             offsetof(l2arc_dev_t, l2ad_node));
6929         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6930             offsetof(l2arc_data_free_t, l2df_list_node));
6931 }
6932
6933 void
6934 l2arc_fini(void)
6935 {
6936         /*
6937          * This is called from dmu_fini(), which is called from spa_fini();
6938          * Because of this, we can assume that all l2arc devices have
6939          * already been removed when the pools themselves were removed.
6940          */
6941
6942         l2arc_do_free_on_write();
6943
6944         mutex_destroy(&l2arc_feed_thr_lock);
6945         cv_destroy(&l2arc_feed_thr_cv);
6946         mutex_destroy(&l2arc_dev_mtx);
6947         mutex_destroy(&l2arc_free_on_write_mtx);
6948
6949         list_destroy(l2arc_dev_list);
6950         list_destroy(l2arc_free_on_write);
6951 }
6952
6953 void
6954 l2arc_start(void)
6955 {
6956         if (!(spa_mode_global & FWRITE))
6957                 return;
6958
6959         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6960             TS_RUN, minclsyspri);
6961 }
6962
6963 void
6964 l2arc_stop(void)
6965 {
6966         if (!(spa_mode_global & FWRITE))
6967                 return;
6968
6969         mutex_enter(&l2arc_feed_thr_lock);
6970         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
6971         l2arc_thread_exit = 1;
6972         while (l2arc_thread_exit != 0)
6973                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6974         mutex_exit(&l2arc_feed_thr_lock);
6975 }
6976
/*
 * Symbol exports and module parameter registrations.  This section is
 * compiled only when both _KERNEL and HAVE_SPL are defined.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* ARC symbols made available through EXPORT_SYMBOL. */
EXPORT_SYMBOL(arc_buf_size);
EXPORT_SYMBOL(arc_write);
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);

/*
 * ARC tunables.  Each module_param() names the variable, its parameter
 * type and its permission mode; the MODULE_PARM_DESC() that follows
 * supplies the description string.  All are registered with mode 0644
 * unless noted otherwise.
 */
module_param(zfs_arc_min, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");

module_param(zfs_arc_max, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_max, "Max arc size");

module_param(zfs_arc_meta_limit, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");

module_param(zfs_arc_meta_min, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");

module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");

module_param(zfs_arc_meta_adjust_restarts, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
        "Limit number of restarts in arc_adjust_meta");

module_param(zfs_arc_meta_strategy, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");

module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");

module_param(zfs_arc_p_aggressive_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");

module_param(zfs_arc_p_dampener_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");

module_param(zfs_arc_shrink_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");

module_param(zfs_arc_p_min_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");

module_param(zfs_disable_dup_eviction, int, 0644);
MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");

/* The only parameter in this section registered with mode 0444. */
module_param(zfs_arc_average_blocksize, int, 0444);
MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");

module_param(zfs_arc_memory_throttle_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");

module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");

module_param(zfs_arc_num_sublists_per_state, int, 0644);
MODULE_PARM_DESC(zfs_arc_num_sublists_per_state,
        "Number of sublists used in each of the ARC state lists");

/* L2ARC tunables. */
module_param(l2arc_write_max, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");

module_param(l2arc_write_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");

module_param(l2arc_headroom, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");

module_param(l2arc_headroom_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");

module_param(l2arc_feed_secs, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");

module_param(l2arc_feed_min_ms, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");

module_param(l2arc_noprefetch, int, 0644);
MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");

module_param(l2arc_nocompress, int, 0644);
MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");

module_param(l2arc_feed_again, int, 0644);
MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");

module_param(l2arc_norw, int, 0644);
MODULE_PARM_DESC(l2arc_norw, "No reads during writes");

#endif