module/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27
  28 /*
  29  * DVA-based Adjustable Replacement Cache
  30  *
  31  * While much of the theory of operation used here is
  32  * based on the self-tuning, low overhead replacement cache
  33  * presented by Megiddo and Modha at FAST 2003, there are some
  34  * significant differences:
  35  *
  36  * 1. The Megiddo and Modha model assumes any page is evictable.
  37  * Pages in its cache cannot be "locked" into memory.  This makes
  38  * the eviction algorithm simple: evict the last page in the list.
  39  * This also makes the performance characteristics easy to reason
  40  * about.  Our cache is not so simple.  At any given moment, some
  41  * subset of the blocks in the cache are un-evictable because we
  42  * have handed out a reference to them.  Blocks are only evictable
  43  * when there are no external references active.  This makes
  44  * eviction far more problematic:  we choose to evict the evictable
  45  * blocks that are the "lowest" in the list.
  46  *
  47  * There are times when it is not possible to evict the requested
  48  * space.  In these circumstances we are unable to adjust the cache
  49  * size.  To prevent the cache growing unbounded at these times we
  50  * implement a "cache throttle" that slows the flow of new data
  51  * into the cache until we can make space available.
  52  *
  53  * 2. The Megiddo and Modha model assumes a fixed cache size.
  54  * Pages are evicted when the cache is full and there is a cache
  55  * miss.  Our model has a variable sized cache.  It grows with
  56  * high use, but also tries to react to memory pressure from the
  57  * operating system: decreasing its size when system memory is
  58  * tight.
  59  *
  60  * 3. The Megiddo and Modha model assumes a fixed page size. All
  61  * elements of the cache are therefore exactly the same size.  So
  62  * when adjusting the cache size following a cache miss, it's simply
  63  * a matter of choosing a single page to evict.  In our model, we
  64  * have variable sized cache blocks (ranging from 512 bytes to
  65  * 128K bytes).  We therefore choose a set of blocks to evict to make
  66  * space for a cache miss that approximates as closely as possible
  67  * the space used by the new block.
  68  *
  69  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  70  * by N. Megiddo & D. Modha, FAST 2003
  71  */
  72
  73 /*
  74  * The locking model:
  75  *
  76  * A new reference to a cache buffer can be obtained in two
  77  * ways: 1) via a hash table lookup using the DVA as a key,
  78  * or 2) via one of the ARC lists.  The arc_read() interface
  79  * uses method 1, while the internal arc algorithms for
  80  * adjusting the cache use method 2.  We therefore provide two
  81  * types of locks: 1) the hash table lock array, and 2) the
  82  * arc list locks.
  83  *
  84  * Buffers do not have their own mutexes, rather they rely on the
  85  * hash table mutexes for the bulk of their protection (i.e. most
  86  * fields in the arc_buf_hdr_t are protected by these mutexes).
  87  *
  88  * buf_hash_find() returns the appropriate mutex (held) when it
  89  * locates the requested buffer in the hash table.  It returns
  90  * NULL for the mutex if the buffer was not in the table.
  91  *
  92  * buf_hash_remove() expects the appropriate hash mutex to be
  93  * already held before it is invoked.
  94  *
  95  * Each arc state also has a mutex which is used to protect the
  96  * buffer list associated with the state.  When attempting to
  97  * obtain a hash table lock while holding an arc list lock you
  98  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  99  * the active state mutex must be held before the ghost state mutex.
 100  *
 101  * Arc buffers may have an associated eviction callback function.
 102  * This function will be invoked prior to removing the buffer (e.g.
 103  * in arc_do_user_evicts()).  Note however that the data associated
 104  * with the buffer may be evicted prior to the callback.  The callback
 105  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 106  * the users of callbacks must ensure that their private data is
 107  * protected from simultaneous callbacks from arc_clear_callback()
 108  * and arc_do_user_evicts().
 109  *
 110  * It is also possible to register a callback which is run when the
 111  * arc_meta_limit is reached and no buffers can be safely evicted.  In
 112  * this case the arc user should drop a reference on some arc buffers so
 113  * they can be reclaimed and the arc_meta_limit honored.  For example,
 114  * when using the ZPL each dentry holds a reference on a znode.  These
 115  * dentries must be pruned before the arc buffer holding the znode can
 116  * be safely evicted.
 117  *
 118  * Note that the majority of the performance stats are manipulated
 119  * with atomic operations.
 120  *
 121  * The L2ARC uses the l2ad_mtx on each vdev for the following:
 122  *
 123  *      - L2ARC buflist creation
 124  *      - L2ARC buflist eviction
 125  *      - L2ARC write completion, which walks L2ARC buflists
 126  *      - ARC header destruction, as it removes from L2ARC buflists
 127  *      - ARC header release, as it removes from L2ARC buflists
 128  */
 129
 130 #include <sys/spa.h>
 131 #include <sys/zio.h>
 132 #include <sys/zio_compress.h>
 133 #include <sys/zfs_context.h>
 134 #include <sys/arc.h>
 135 #include <sys/vdev.h>
 136 #include <sys/vdev_impl.h>
 137 #include <sys/dsl_pool.h>
 138 #include <sys/multilist.h>
 139 #ifdef _KERNEL
 140 #include <sys/vmsystm.h>
 141 #include <vm/anon.h>
 142 #include <sys/fs/swapnode.h>
 143 #include <sys/zpl.h>
 144 #include <linux/mm_compat.h>
 145 #endif
 146 #include <sys/callb.h>
 147 #include <sys/kstat.h>
 148 #include <sys/dmu_tx.h>
 149 #include <zfs_fletcher.h>
 150 #include <sys/arc_impl.h>
 151 #include <sys/trace_arc.h>
 152
 153 #ifndef _KERNEL
 154 /* Non-kernel builds only: set with ZFS_DEBUG=watch to enable watchpoints on frozen buffers. */
 155 boolean_t arc_watch = B_FALSE;  /* off by default */
 156 #endif
 157
 158 static kmutex_t         arc_reclaim_lock;       /* NOTE(review): presumably guards the reclaim exit flag and cvs below -- confirm */
 159 static kcondvar_t       arc_reclaim_thread_cv;
 160 static boolean_t        arc_reclaim_thread_exit;
 161 static kcondvar_t       arc_reclaim_waiters_cv;
 162
 163 static kmutex_t         arc_user_evicts_lock;   /* NOTE(review): presumably guards the user-evicts cv and exit flag below -- confirm */
 164 static kcondvar_t       arc_user_evicts_cv;
 165 static boolean_t        arc_user_evicts_thread_exit;
 166
 167 /* Number of objects to prune from caches when arc_meta_limit is reached (default 10000). */
 168 int zfs_arc_meta_prune = 10000;
 169
 170 /* Preferred strategy to employ when arc_meta_limit is reached; defaults to ARC_STRATEGY_META_BALANCED. */
 171 int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
 172
 173 typedef enum arc_reclaim_strategy {     /* the two available reclaim strategies */
 174         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy (enum value 0) */
 175         ARC_RECLAIM_CONS                /* Conservative reclaim strategy (enum value 1) */
 176 } arc_reclaim_strategy_t;
 177
 178 /*
 179  * The number of headers to evict in arc_evict_state_impl() before
 180  * dropping the sublist lock and evicting from another sublist. A lower
 181  * value means we're more likely to evict the "correct" header (i.e. the
 182  * oldest header in the arc state), but comes with higher overhead
 183  * (i.e. more invocations of arc_evict_state_impl()).
 184  */
 185 int zfs_arc_evict_batch_limit = 10;
 186
 187 /*
 188  * The number of sublists used for each of the arc state lists. If this
 189  * is not set to a suitable value by the user, it will be configured to
 190  * the number of CPUs on the system in arc_init().
 191  */
 192 int zfs_arc_num_sublists_per_state = 0;
 193
 194 /* Number of seconds before growing the cache again. */
 195 int zfs_arc_grow_retry = 5;
 196
 197 /* Shift applied to arc_c when calculating the overflow limit in arc_get_data_buf(). */
 198 int zfs_arc_overflow_shift = 8;
 199
 200 /* Disable anon data aggressively growing arc_p (set to 1 here). */
 201 int zfs_arc_p_aggressive_disable = 1;
 202
 203 /* Disable the arc_p adapt dampener in arc_adapt (set to 1 here). */
 204 int zfs_arc_p_dampener_disable = 1;
 205
 206 /* log2(fraction of arc to reclaim); NOTE(review): if applied as a right shift, 5 means 1/32 -- confirm */
 207 int zfs_arc_shrink_shift = 5;
 208
 209 /*
 210  * Tunable: minimum lifespan of a prefetch block in clock ticks.
 211  * Statically defaults to HZ (original note: initialized in arc_init()).
 212  */
 213 int zfs_arc_min_prefetch_lifespan = HZ;
 214
 215 /* Disable proactive ARC throttling due to low memory (set to 1 here). */
 216 int zfs_arc_memory_throttle_disable = 1;
 217
 218 /* Disable duplicate buffer eviction (0 here, so not disabled). */
 219 int zfs_disable_dup_eviction = 0;
 220
 221 /* Average block size used to size buf_hash_table. */
 222 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 223
 224 /*
 225  * File-local minimum lifespan of a prefetch block in clock ticks
 226  * (initialized in arc_init()); distinct from the zfs_arc_min_prefetch_lifespan tunable.
 227  */
 228 static int arc_min_prefetch_lifespan;
 229
 230 /*
 231  * If this percent of memory is free, don't throttle (default 10).
 232  */
 233 int arc_lotsfree_percent = 10;
 234
 235 static int arc_dead;    /* NOTE(review): undocumented; by name, a flag marking the ARC as shut down -- confirm */
 236
 237 /* Expiration time for arc_no_grow. */
 238 static clock_t arc_grow_time = 0;
 239
 240 /*
 241  * Set once the arc has filled available memory and has now warmed up.
 242  */
 243 static boolean_t arc_warm;
 244
 245 /*
 246  * These tunables are for performance analysis; all four default to 0 here.
 247  */
 248 unsigned long zfs_arc_max = 0;
 249 unsigned long zfs_arc_min = 0;
 250 unsigned long zfs_arc_meta_limit = 0;
 251 unsigned long zfs_arc_meta_min = 0;
 252
 253 /*
 254  * Limit on the number of restarts in arc_adjust_meta() (default 4096).
 255  */
 256 unsigned long zfs_arc_meta_adjust_restarts = 4096;
 257
 258 /* Storage for the 6 states.  NOTE(review): the lowercase arc_* pointers declared later presumably refer to these -- confirm. */
 259 static arc_state_t ARC_anon;
 260 static arc_state_t ARC_mru;
 261 static arc_state_t ARC_mru_ghost;
 262 static arc_state_t ARC_mfu;
 263 static arc_state_t ARC_mfu_ghost;
 264 static arc_state_t ARC_l2c_only;
 265
 266 typedef struct arc_stats {      /* one kstat_named_t per ARC statistic */
 267         kstat_named_t arcstat_hits;
 268         kstat_named_t arcstat_misses;
 269         kstat_named_t arcstat_demand_data_hits;
 270         kstat_named_t arcstat_demand_data_misses;
 271         kstat_named_t arcstat_demand_metadata_hits;
 272         kstat_named_t arcstat_demand_metadata_misses;
 273         kstat_named_t arcstat_prefetch_data_hits;
 274         kstat_named_t arcstat_prefetch_data_misses;
 275         kstat_named_t arcstat_prefetch_metadata_hits;
 276         kstat_named_t arcstat_prefetch_metadata_misses;
 277         kstat_named_t arcstat_mru_hits;
 278         kstat_named_t arcstat_mru_ghost_hits;
 279         kstat_named_t arcstat_mfu_hits;
 280         kstat_named_t arcstat_mfu_ghost_hits;
 281         kstat_named_t arcstat_deleted;
 282         /*
 283          * Number of buffers that could not be evicted because the hash lock
 284          * was held by another thread.  The lock may not necessarily be held
 285          * by something using the same buffer, since hash locks are shared
 286          * by multiple buffers.
 287          */
 288         kstat_named_t arcstat_mutex_miss;
 289         /*
 290          * Number of buffers skipped because they have I/O in progress, are
 291          * indirect prefetch buffers that have not lived long enough, or are
 292          * not from the spa we're trying to evict from.
 293          */
 294         kstat_named_t arcstat_evict_skip;
 295         /*
 296          * Number of times arc_evict_state() was unable to evict enough
 297          * buffers to reach its target amount.
 298          */
 299         kstat_named_t arcstat_evict_not_enough;
 300         kstat_named_t arcstat_evict_l2_cached;
 301         kstat_named_t arcstat_evict_l2_eligible;
 302         kstat_named_t arcstat_evict_l2_ineligible;
 303         kstat_named_t arcstat_evict_l2_skip;
 304         kstat_named_t arcstat_hash_elements;
 305         kstat_named_t arcstat_hash_elements_max;
 306         kstat_named_t arcstat_hash_collisions;
 307         kstat_named_t arcstat_hash_chains;
 308         kstat_named_t arcstat_hash_chain_max;
 309         kstat_named_t arcstat_p;                /* target size of MRU (arc_p) */
 310         kstat_named_t arcstat_c;                /* target size of cache (arc_c) */
 311         kstat_named_t arcstat_c_min;            /* min target cache size (arc_c_min) */
 312         kstat_named_t arcstat_c_max;            /* max target cache size (arc_c_max) */
 313         kstat_named_t arcstat_size;             /* actual total arc size (arc_size) */
 314         kstat_named_t arcstat_hdr_size;
 315         kstat_named_t arcstat_data_size;
 316         kstat_named_t arcstat_meta_size;
 317         kstat_named_t arcstat_other_size;
 318         kstat_named_t arcstat_anon_size;
 319         kstat_named_t arcstat_anon_evict_data;
 320         kstat_named_t arcstat_anon_evict_metadata;
 321         kstat_named_t arcstat_mru_size;
 322         kstat_named_t arcstat_mru_evict_data;
 323         kstat_named_t arcstat_mru_evict_metadata;
 324         kstat_named_t arcstat_mru_ghost_size;
 325         kstat_named_t arcstat_mru_ghost_evict_data;
 326         kstat_named_t arcstat_mru_ghost_evict_metadata;
 327         kstat_named_t arcstat_mfu_size;
 328         kstat_named_t arcstat_mfu_evict_data;
 329         kstat_named_t arcstat_mfu_evict_metadata;
 330         kstat_named_t arcstat_mfu_ghost_size;
 331         kstat_named_t arcstat_mfu_ghost_evict_data;
 332         kstat_named_t arcstat_mfu_ghost_evict_metadata;
 333         kstat_named_t arcstat_l2_hits;
 334         kstat_named_t arcstat_l2_misses;
 335         kstat_named_t arcstat_l2_feeds;
 336         kstat_named_t arcstat_l2_rw_clash;
 337         kstat_named_t arcstat_l2_read_bytes;
 338         kstat_named_t arcstat_l2_write_bytes;
 339         kstat_named_t arcstat_l2_writes_sent;
 340         kstat_named_t arcstat_l2_writes_done;
 341         kstat_named_t arcstat_l2_writes_error;
 342         kstat_named_t arcstat_l2_writes_lock_retry;
 343         kstat_named_t arcstat_l2_evict_lock_retry;
 344         kstat_named_t arcstat_l2_evict_reading;
 345         kstat_named_t arcstat_l2_evict_l1cached;
 346         kstat_named_t arcstat_l2_free_on_write;
 347         kstat_named_t arcstat_l2_cdata_free_on_write;
 348         kstat_named_t arcstat_l2_abort_lowmem;
 349         kstat_named_t arcstat_l2_cksum_bad;
 350         kstat_named_t arcstat_l2_io_error;
 351         kstat_named_t arcstat_l2_size;
 352         kstat_named_t arcstat_l2_asize;
 353         kstat_named_t arcstat_l2_hdr_size;
 354         kstat_named_t arcstat_l2_compress_successes;
 355         kstat_named_t arcstat_l2_compress_zeros;
 356         kstat_named_t arcstat_l2_compress_failures;
 357         kstat_named_t arcstat_memory_throttle_count;
 358         kstat_named_t arcstat_duplicate_buffers;
 359         kstat_named_t arcstat_duplicate_buffers_size;
 360         kstat_named_t arcstat_duplicate_reads;
 361         kstat_named_t arcstat_memory_direct_count;
 362         kstat_named_t arcstat_memory_indirect_count;
 363         kstat_named_t arcstat_no_grow;
 364         kstat_named_t arcstat_tempreserve;
 365         kstat_named_t arcstat_loaned_bytes;
 366         kstat_named_t arcstat_prune;
 367         kstat_named_t arcstat_meta_used;        /* size of metadata (arc_meta_used) */
 368         kstat_named_t arcstat_meta_limit;       /* max size for metadata (arc_meta_limit) */
 369         kstat_named_t arcstat_meta_max;         /* max size of metadata (arc_meta_max) */
 370         kstat_named_t arcstat_meta_min;         /* min size for metadata (arc_meta_min) */
 371 } arc_stats_t;
 372
 373 static arc_stats_t arc_stats = {        /* positional: entries must stay in arc_stats_t field order */
 374         { "hits",                       KSTAT_DATA_UINT64 },
 375         { "misses",                     KSTAT_DATA_UINT64 },
 376         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 377         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 378         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 379         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 380         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 381         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 382         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 383         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 384         { "mru_hits",                   KSTAT_DATA_UINT64 },
 385         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 386         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 387         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 388         { "deleted",                    KSTAT_DATA_UINT64 },
 389         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 390         { "evict_skip",                 KSTAT_DATA_UINT64 },
 391         { "evict_not_enough",           KSTAT_DATA_UINT64 },
 392         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 393         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 394         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 395         { "evict_l2_skip",              KSTAT_DATA_UINT64 },
 396         { "hash_elements",              KSTAT_DATA_UINT64 },
 397         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 398         { "hash_collisions",            KSTAT_DATA_UINT64 },
 399         { "hash_chains",                KSTAT_DATA_UINT64 },
 400         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 401         { "p",                          KSTAT_DATA_UINT64 },
 402         { "c",                          KSTAT_DATA_UINT64 },
 403         { "c_min",                      KSTAT_DATA_UINT64 },
 404         { "c_max",                      KSTAT_DATA_UINT64 },
 405         { "size",                       KSTAT_DATA_UINT64 },
 406         { "hdr_size",                   KSTAT_DATA_UINT64 },
 407         { "data_size",                  KSTAT_DATA_UINT64 },
 408         { "meta_size",                  KSTAT_DATA_UINT64 },
 409         { "other_size",                 KSTAT_DATA_UINT64 },
 410         { "anon_size",                  KSTAT_DATA_UINT64 },
 411         { "anon_evict_data",            KSTAT_DATA_UINT64 },
 412         { "anon_evict_metadata",        KSTAT_DATA_UINT64 },
 413         { "mru_size",                   KSTAT_DATA_UINT64 },
 414         { "mru_evict_data",             KSTAT_DATA_UINT64 },
 415         { "mru_evict_metadata",         KSTAT_DATA_UINT64 },
 416         { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 417         { "mru_ghost_evict_data",       KSTAT_DATA_UINT64 },
 418         { "mru_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
 419         { "mfu_size",                   KSTAT_DATA_UINT64 },
 420         { "mfu_evict_data",             KSTAT_DATA_UINT64 },
 421         { "mfu_evict_metadata",         KSTAT_DATA_UINT64 },
 422         { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 423         { "mfu_ghost_evict_data",       KSTAT_DATA_UINT64 },
 424         { "mfu_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
 425         { "l2_hits",                    KSTAT_DATA_UINT64 },
 426         { "l2_misses",                  KSTAT_DATA_UINT64 },
 427         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 428         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 429         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 430         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 431         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 432         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 433         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 434         { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
 435         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 436         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 437         { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
 438         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 439         { "l2_cdata_free_on_write",     KSTAT_DATA_UINT64 },
 440         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 441         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 442         { "l2_io_error",                KSTAT_DATA_UINT64 },
 443         { "l2_size",                    KSTAT_DATA_UINT64 },
 444         { "l2_asize",                   KSTAT_DATA_UINT64 },
 445         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 446         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 447         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 448         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 449         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 450         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 451         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 452         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 453         { "memory_direct_count",        KSTAT_DATA_UINT64 },
 454         { "memory_indirect_count",      KSTAT_DATA_UINT64 },
 455         { "arc_no_grow",                KSTAT_DATA_UINT64 },
 456         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
 457         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
 458         { "arc_prune",                  KSTAT_DATA_UINT64 },
 459         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 460         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 461         { "arc_meta_max",               KSTAT_DATA_UINT64 },
 462         { "arc_meta_min",               KSTAT_DATA_UINT64 },
 463 };
 464
 465 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)     /* the ui64 value of the named arc_stats field */
 466 /* Add val (may be negative, see ARCSTAT_BUMPDOWN) to the named statistic via atomic_add_64(). */
 467 #define ARCSTAT_INCR(stat, val) \
 468         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 469
 470 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)   /* increment by one */
 471 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)  /* decrement by one */
 472 /* Raise stat to val while val is larger, retrying the CAS; expands to a bare { } block and re-evaluates val. */
 473 #define ARCSTAT_MAX(stat, val) {                                        \
 474         uint64_t m;                                                     \
 475         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 476             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 477                 continue;                                               \
 478 }
 479 /* Fold the current value of <stat> into its companion <stat>_max statistic. */
 480 #define ARCSTAT_MAXSTAT(stat) \
 481         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 482
 483 /*
 484  * Break ARC hits/misses down by two conditions (four subtypes each, eight
 485  * statistics total): bumps arcstat_<stat1|notstat1>_<stat2|notstat2>_<stat>.
 486  * Expands to a bare if/else, so use it only as a complete statement.
 487  */
 488 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 489         if (cond1) {                                                    \
 490                 if (cond2) {                                            \
 491                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 492                 } else {                                                \
 493                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 494                 }                                                       \
 495         } else {                                                        \
 496                 if (cond2) {                                            \
 497                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 498                 } else {                                                \
 499                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 500                 }                                                       \
 501         }
 502
 503 kstat_t                 *arc_ksp;       /* kstat handle; NOTE(review): presumably the one exporting arc_stats -- confirm */
 504 static arc_state_t      *arc_anon;      /* state pointers; none of them is initialized at its declaration */
 505 static arc_state_t      *arc_mru;
 506 static arc_state_t      *arc_mru_ghost; /* counted as a ghost state by GHOST_STATE() */
 507 static arc_state_t      *arc_mfu;
 508 static arc_state_t      *arc_mfu_ghost; /* counted as a ghost state by GHOST_STATE() */
 509 static arc_state_t      *arc_l2c_only;  /* counted as a ghost state by GHOST_STATE() */
 510
 511 /*
 512  * There are several ARC variables that are critical to export as kstats --
 513  * but we don't want to have to grovel around in the kstat whenever we wish to
 514  * manipulate them.  For these variables, we therefore define them to be in
 515  * terms of the statistic variable.  This ensures that we are not introducing
 516  * the possibility of inconsistency by having shadow copies of the variables,
 517  * while still allowing the code to be readable.
 518  */
 519 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 520 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 521 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 522 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 523 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 524 #define arc_no_grow     ARCSTAT(arcstat_no_grow)        /* kstat "arc_no_grow" */
 525 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)    /* kstat "arc_tempreserve" */
 526 #define arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)   /* kstat "arc_loaned_bytes" */
 527 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 528 #define arc_meta_min    ARCSTAT(arcstat_meta_min) /* min size for metadata */
 529 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 530 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 531
 532 #define L2ARC_IS_VALID_COMPRESS(_c_) /* true only for ZIO_COMPRESS_LZ4 or ZIO_COMPRESS_EMPTY; _c_ is evaluated twice */ \
 533         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 534
 535 static list_t arc_prune_list;           /* NOTE(review): by name, the registered prune callbacks -- confirm */
 536 static kmutex_t arc_prune_mtx;          /* NOTE(review): presumably guards arc_prune_list -- confirm */
 537 static taskq_t *arc_prune_taskq;        /* NOTE(review): presumably runs the prune callbacks -- confirm */
 538 static arc_buf_t *arc_eviction_list;    /* NOTE(review): usage is outside this part of the file */
 539 static arc_buf_hdr_t arc_eviction_hdr;  /* NOTE(review): usage is outside this part of the file */
 540
 541 #define GHOST_STATE(state)      /* true for arc_mru_ghost, arc_mfu_ghost or arc_l2c_only; state is evaluated up to 3 times */ \
 542         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 543         (state) == arc_l2c_only)
 544
 545 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)       /* single-flag tests yield the masked bit, not 0/1 */
 546 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 547 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 548 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
 549 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
 550 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
 551 /* L2-related flag tests */
 552 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
 553 #define HDR_L2COMPRESS(hdr)     ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
 554 #define HDR_L2_READING(hdr)     /* I/O in progress AND the HAS_L2HDR flag set; yields 0 or 1 */ \
 555             (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&      \
 556             ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 557 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 558 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 559 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 560 /* Buffer type tests: a header counts as data whenever the metadata flag is clear */
 561 #define HDR_ISTYPE_METADATA(hdr)        \
 562             ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 563 #define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
 564 /* Tests for the HAS_L1HDR / HAS_L2HDR flags */
 565 #define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 566 #define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 567
 568 /* For storing compression mode in b_flags: HDR_COMPRESS_NBITS bits at HDR_COMPRESS_OFFSET (assuming BF32_* take word, low bit, length) */
 569 #define HDR_COMPRESS_OFFSET     24
 570 #define HDR_COMPRESS_NBITS      7
 571 /* Accessors for that field; hdr is parenthesized so any pointer-valued expression can be passed safely. */
 572 #define HDR_GET_COMPRESS(hdr)   ((enum zio_compress)BF32_GET((hdr)->b_flags, \
 573             HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
 574 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
 575             HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
 576
 577 /*
 578  * Header sizes: the full arc_buf_hdr_t, and the leading part that precedes b_l1hdr
 579  */
 580
 581 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 582 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 583
 584 /*
 585  * Hash table routines
 586  */
 587 /* NOTE(review): assumes P2NPHASE(x, a) is the distance from x up to the next multiple of a -- confirm */
 588 #define HT_LOCK_ALIGN   64
 589 #define HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 590
 591 struct ht_lock {                /* one hash-table lock slot */
 592         kmutex_t        ht_lock;
 593 #ifdef _KERNEL
 594         unsigned char   pad[HT_LOCK_PAD];       /* kernel builds only; NOTE(review): length is 0 if HT_LOCK_PAD evaluates to 0 -- confirm that is acceptable */
 595 #endif
 596 };
 597
 598 #define BUF_LOCKS 8192  /* must stay a power of two: BUF_HASH_LOCK_NTRY masks with BUF_LOCKS-1 */
 599 typedef struct buf_hash_table {
 600         uint64_t ht_mask;               /* ANDed with buf_hash() to form a table index (BUF_HASH_INDEX) */
 601         arc_buf_hdr_t **ht_table;       /* array of header pointers; NOTE(review): presumably hash chain heads -- confirm */
 602         struct ht_lock ht_locks[BUF_LOCKS];     /* lock slots, selected by index & (BUF_LOCKS-1) */
 603 } buf_hash_table_t;
 604
 605 static buf_hash_table_t buf_hash_table; /* the single file-local hash table instance */
 606
 607 #define BUF_HASH_INDEX(spa, dva, birth) /* table index for a (spa, dva, birth) key */ \
 608         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 609 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])        /* lock slot for idx; idx parenthesized so expressions mask correctly */
 610 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))    /* address of the kmutex in that slot */
 611 #define HDR_LOCK(hdr) /* hash lock covering hdr; hdr is evaluated three times */ \
 612         (BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, (hdr)->b_birth)))
 613
 614 uint64_t zfs_crc64_table[256];  /* lookup table used by buf_hash(), which asserts entry 128 equals ZFS_CRC64_POLY */
 615
 616 /*
 617  * Level 2 ARC: default values for the tunables declared below
 618  */
 619
 620 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max (8 MiB) */
 621 #define L2ARC_HEADROOM          2                       /* num of writes */
 622 /*
 623  * If we discover during ARC scan any buffers to be compressed, we boost
 624  * our headroom for the next scanning cycle by this percentage multiple.
 625  */
 626 #define L2ARC_HEADROOM_BOOST    200
 627 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 628 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 629 /* Shorthand for the l2_writes_sent / l2_writes_done kstat values */
 630 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 631 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 632
 633 /* L2ARC Performance Tunables (defaults come from the L2ARC_* constants above) */
 634 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;       /* def max write size */
 635 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;     /* extra warmup write */
 636 unsigned long l2arc_headroom = L2ARC_HEADROOM;          /* # of dev writes */
 637 unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;      /* headroom boost, as a percentage */
 638 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;        /* interval seconds */
 639 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;    /* min interval msecs */
 640 int l2arc_noprefetch = B_TRUE;                  /* don't cache prefetch bufs */
 641 int l2arc_nocompress = B_FALSE;                 /* don't compress bufs */
 642 int l2arc_feed_again = B_TRUE;                  /* turbo warmup */
 643 int l2arc_norw = B_FALSE;                       /* no reads during writes */
 644
 645 /*
 646  * L2ARC Internals
 647  */
 648 static list_t L2ARC_dev_list;                   /* device list storage */
 649 static list_t *l2arc_dev_list;                  /* device list pointer */
 650 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 651 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 652 static list_t L2ARC_free_on_write;              /* free after write buf list storage */
 653 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 654 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for the free after write list */
 655 static uint64_t l2arc_ndev;                     /* number of devices */
 656
 657 typedef struct l2arc_read_callback {    /* NOTE(review): presumably the context handed to l2arc_read_done() -- confirm */
 658         arc_buf_t               *l2rcb_buf;             /* read buffer */
 659         spa_t                   *l2rcb_spa;             /* spa */
 660         blkptr_t                l2rcb_bp;               /* original blkptr */
 661         zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
 662         int                     l2rcb_flags;            /* original flags */
 663         enum zio_compress       l2rcb_compress;         /* applied compress */
 664 } l2arc_read_callback_t;
 665
 666 typedef struct l2arc_data_free {
 667         /* protected by l2arc_free_on_write_mtx */
 668         void            *l2df_data;
 669         size_t          l2df_size;
 670         void            (*l2df_func)(void *, size_t);
 671         list_node_t     l2df_list_node;
 672 } l2arc_data_free_t;
 673
 674 static kmutex_t l2arc_feed_thr_lock;
 675 static kcondvar_t l2arc_feed_thr_cv;
 676 static uint8_t l2arc_thread_exit;
 677
 678 static void arc_get_data_buf(arc_buf_t *);
 679 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 680 static boolean_t arc_is_overflowing(void);
 681 static void arc_buf_watch(arc_buf_t *);
 682
 683 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 684 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 685
 686 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 687 static void l2arc_read_done(zio_t *);
 688
 689 static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
 690 static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
 691 static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
 692
 693 static uint64_t
 694 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 695 {
 696         uint8_t *vdva = (uint8_t *)dva;
 697         uint64_t crc = -1ULL;
 698         int i;
 699
 700         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 701
 702         for (i = 0; i < sizeof (dva_t); i++)
 703                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 704
 705         crc ^= (spa>>8) ^ birth;
 706
 707         return (crc);
 708 }
 709
 710 #define BUF_EMPTY(buf)                                          \
 711         ((buf)->b_dva.dva_word[0] == 0 &&                       \
 712         (buf)->b_dva.dva_word[1] == 0)
 713
 714 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 715         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 716         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 717         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 718
 719 static void
 720 buf_discard_identity(arc_buf_hdr_t *hdr)
 721 {
 722         hdr->b_dva.dva_word[0] = 0;
 723         hdr->b_dva.dva_word[1] = 0;
 724         hdr->b_birth = 0;
 725 }
 726
 727 static arc_buf_hdr_t *
 728 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 729 {
 730         const dva_t *dva = BP_IDENTITY(bp);
 731         uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 732         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 733         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 734         arc_buf_hdr_t *hdr;
 735
 736         mutex_enter(hash_lock);
 737         for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 738             hdr = hdr->b_hash_next) {
 739                 if (BUF_EQUAL(spa, dva, birth, hdr)) {
 740                         *lockp = hash_lock;
 741                         return (hdr);
 742                 }
 743         }
 744         mutex_exit(hash_lock);
 745         *lockp = NULL;
 746         return (NULL);
 747 }
 748
 749 /*
 750  * Insert an entry into the hash table.  If there is already an element
 751  * equal to elem in the hash table, then the already existing element
 752  * will be returned and the new element will not be inserted.
 753  * Otherwise returns NULL.
 754  * If lockp == NULL, the caller is assumed to already hold the hash lock.
 755  */
 756 static arc_buf_hdr_t *
 757 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 758 {
 759         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 760         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 761         arc_buf_hdr_t *fhdr;
 762         uint32_t i;
 763
 764         ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 765         ASSERT(hdr->b_birth != 0);
 766         ASSERT(!HDR_IN_HASH_TABLE(hdr));
 767
 768         if (lockp != NULL) {
 769                 *lockp = hash_lock;
 770                 mutex_enter(hash_lock);
 771         } else {
 772                 ASSERT(MUTEX_HELD(hash_lock));
 773         }
 774
 775         for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
 776             fhdr = fhdr->b_hash_next, i++) {
 777                 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
 778                         return (fhdr);
 779         }
 780
 781         hdr->b_hash_next = buf_hash_table.ht_table[idx];
 782         buf_hash_table.ht_table[idx] = hdr;
 783         hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
 784
 785         /* collect some hash table performance data */
 786         if (i > 0) {
 787                 ARCSTAT_BUMP(arcstat_hash_collisions);
 788                 if (i == 1)
 789                         ARCSTAT_BUMP(arcstat_hash_chains);
 790
 791                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 792         }
 793
 794         ARCSTAT_BUMP(arcstat_hash_elements);
 795         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 796
 797         return (NULL);
 798 }
 799
 800 static void
 801 buf_hash_remove(arc_buf_hdr_t *hdr)
 802 {
 803         arc_buf_hdr_t *fhdr, **hdrp;
 804         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 805
 806         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 807         ASSERT(HDR_IN_HASH_TABLE(hdr));
 808
 809         hdrp = &buf_hash_table.ht_table[idx];
 810         while ((fhdr = *hdrp) != hdr) {
 811                 ASSERT(fhdr != NULL);
 812                 hdrp = &fhdr->b_hash_next;
 813         }
 814         *hdrp = hdr->b_hash_next;
 815         hdr->b_hash_next = NULL;
 816         hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
 817
 818         /* collect some hash table performance data */
 819         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 820
 821         if (buf_hash_table.ht_table[idx] &&
 822             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 823                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 824 }
 825
 826 /*
 827  * Global data structures and functions for the buf kmem cache.
 828  */
 829 static kmem_cache_t *hdr_full_cache;
 830 static kmem_cache_t *hdr_l2only_cache;
 831 static kmem_cache_t *buf_cache;
 832
 833 static void
 834 buf_fini(void)
 835 {
 836         int i;
 837
 838 #if defined(_KERNEL) && defined(HAVE_SPL)
 839         /*
 840          * Large allocations which do not require contiguous pages
 841          * should be using vmem_free() in the linux kernel\
 842          */
 843         vmem_free(buf_hash_table.ht_table,
 844             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 845 #else
 846         kmem_free(buf_hash_table.ht_table,
 847             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 848 #endif
 849         for (i = 0; i < BUF_LOCKS; i++)
 850                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 851         kmem_cache_destroy(hdr_full_cache);
 852         kmem_cache_destroy(hdr_l2only_cache);
 853         kmem_cache_destroy(buf_cache);
 854 }
 855
 856 /*
 857  * Constructor callback - called when the cache is empty
 858  * and a new buf is requested.
 859  */
 860 /* ARGSUSED */
 861 static int
 862 hdr_full_cons(void *vbuf, void *unused, int kmflag)
 863 {
 864         arc_buf_hdr_t *hdr = vbuf;
 865
 866         bzero(hdr, HDR_FULL_SIZE);
 867         cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
 868         refcount_create(&hdr->b_l1hdr.b_refcnt);
 869         mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 870         list_link_init(&hdr->b_l1hdr.b_arc_node);
 871         list_link_init(&hdr->b_l2hdr.b_l2node);
 872         multilist_link_init(&hdr->b_l1hdr.b_arc_node);
 873         arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 874
 875         return (0);
 876 }
 877
 878 /* ARGSUSED */
 879 static int
 880 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 881 {
 882         arc_buf_hdr_t *hdr = vbuf;
 883
 884         bzero(hdr, HDR_L2ONLY_SIZE);
 885         arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 886
 887         return (0);
 888 }
 889
 890 /* ARGSUSED */
 891 static int
 892 buf_cons(void *vbuf, void *unused, int kmflag)
 893 {
 894         arc_buf_t *buf = vbuf;
 895
 896         bzero(buf, sizeof (arc_buf_t));
 897         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 898         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 899
 900         return (0);
 901 }
 902
 903 /*
 904  * Destructor callback - called when a cached buf is
 905  * no longer required.
 906  */
 907 /* ARGSUSED */
 908 static void
 909 hdr_full_dest(void *vbuf, void *unused)
 910 {
 911         arc_buf_hdr_t *hdr = vbuf;
 912
 913         ASSERT(BUF_EMPTY(hdr));
 914         cv_destroy(&hdr->b_l1hdr.b_cv);
 915         refcount_destroy(&hdr->b_l1hdr.b_refcnt);
 916         mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
 917         ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 918         arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 919 }
 920
 921 /* ARGSUSED */
 922 static void
 923 hdr_l2only_dest(void *vbuf, void *unused)
 924 {
 925         ASSERTV(arc_buf_hdr_t *hdr = vbuf);
 926
 927         ASSERT(BUF_EMPTY(hdr));
 928         arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 929 }
 930
 931 /* ARGSUSED */
 932 static void
 933 buf_dest(void *vbuf, void *unused)
 934 {
 935         arc_buf_t *buf = vbuf;
 936
 937         mutex_destroy(&buf->b_evict_lock);
 938         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 939 }
 940
 941 static void
 942 buf_init(void)
 943 {
 944         uint64_t *ct;
 945         uint64_t hsize = 1ULL << 12;
 946         int i, j;
 947
 948         /*
 949          * The hash table is big enough to fill all of physical memory
 950          * with an average block size of zfs_arc_average_blocksize (default 8K).
 951          * By default, the table will take up
 952          * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 953          */
 954         while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
 955                 hsize <<= 1;
 956 retry:
 957         buf_hash_table.ht_mask = hsize - 1;
 958 #if defined(_KERNEL) && defined(HAVE_SPL)
 959         /*
 960          * Large allocations which do not require contiguous pages
 961          * should be using vmem_alloc() in the linux kernel
 962          */
 963         buf_hash_table.ht_table =
 964             vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
 965 #else
 966         buf_hash_table.ht_table =
 967             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 968 #endif
 969         if (buf_hash_table.ht_table == NULL) {
 970                 ASSERT(hsize > (1ULL << 8));
 971                 hsize >>= 1;
 972                 goto retry;
 973         }
 974
 975         hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
 976             0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
 977         hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
 978             HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
 979             NULL, NULL, 0);
 980         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 981             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 982
 983         for (i = 0; i < 256; i++)
 984                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 985                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 986
 987         for (i = 0; i < BUF_LOCKS; i++) {
 988                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 989                     NULL, MUTEX_DEFAULT, NULL);
 990         }
 991 }
 992
 993 /*
 994  * Transition between the two allocation states for the arc_buf_hdr struct.
 995  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 996  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 997  * version is used when a cache buffer is only in the L2ARC in order to reduce
 998  * memory usage.
 999  */
1000 static arc_buf_hdr_t *
1001 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1002 {
1003         arc_buf_hdr_t *nhdr;
1004         l2arc_dev_t *dev;
1005
1006         ASSERT(HDR_HAS_L2HDR(hdr));
1007         ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1008             (old == hdr_l2only_cache && new == hdr_full_cache));
1009
1010         dev = hdr->b_l2hdr.b_dev;
1011         nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1012
1013         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1014         buf_hash_remove(hdr);
1015
1016         bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1017         if (new == hdr_full_cache) {
1018                 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1019                 /*
1020                  * arc_access and arc_change_state need to be aware that a
1021                  * header has just come out of L2ARC, so we set its state to
1022                  * l2c_only even though it's about to change.
1023                  */
1024                 nhdr->b_l1hdr.b_state = arc_l2c_only;
1025
1026                 /* Verify previous threads set to NULL before freeing */
1027                 ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1028         } else {
1029                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1030                 ASSERT0(hdr->b_l1hdr.b_datacnt);
1031
1032                 /*
1033                  * If we've reached here, We must have been called from
1034                  * arc_evict_hdr(), as such we should have already been
1035                  * removed from any ghost list we were previously on
1036                  * (which protects us from racing with arc_evict_state),
1037                  * thus no locking is needed during this check.
1038                  */
1039                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1040
1041                 /*
1042                  * A buffer must not be moved into the arc_l2c_only
1043                  * state if it's not finished being written out to the
1044                  * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
1045                  * might try to be accessed, even though it was removed.
1046                  */
1047                 VERIFY(!HDR_L2_WRITING(hdr));
1048                 VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1049
1050                 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1051         }
1052         /*
1053          * The header has been reallocated so we need to re-insert it into any
1054          * lists it was on.
1055          */
1056         (void) buf_hash_insert(nhdr, NULL);
1057
1058         ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1059
1060         mutex_enter(&dev->l2ad_mtx);
1061
1062         /*
1063          * We must place the realloc'ed header back into the list at
1064          * the same spot. Otherwise, if it's placed earlier in the list,
1065          * l2arc_write_buffers() could find it during the function's
1066          * write phase, and try to write it out to the l2arc.
1067          */
1068         list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1069         list_remove(&dev->l2ad_buflist, hdr);
1070
1071         mutex_exit(&dev->l2ad_mtx);
1072
1073         buf_discard_identity(hdr);
1074         hdr->b_freeze_cksum = NULL;
1075         kmem_cache_free(old, hdr);
1076
1077         return (nhdr);
1078 }
1079
1080
/* hz / 16 ticks, i.e. 1/16 of a second (62 ms) */
#define ARC_MINTIME     (hz >> 4)
1082
1083 static void
1084 arc_cksum_verify(arc_buf_t *buf)
1085 {
1086         zio_cksum_t zc;
1087
1088         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1089                 return;
1090
1091         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1092         if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1093                 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1094                 return;
1095         }
1096         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1097         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1098                 panic("buffer modified while frozen!");
1099         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1100 }
1101
1102 static int
1103 arc_cksum_equal(arc_buf_t *buf)
1104 {
1105         zio_cksum_t zc;
1106         int equal;
1107
1108         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1109         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1110         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1111         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1112
1113         return (equal);
1114 }
1115
1116 static void
1117 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1118 {
1119         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1120                 return;
1121
1122         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1123         if (buf->b_hdr->b_freeze_cksum != NULL) {
1124                 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1125                 return;
1126         }
1127         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1128             KM_SLEEP);
1129         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1130             buf->b_hdr->b_freeze_cksum);
1131         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1132         arc_buf_watch(buf);
1133 }
1134
1135 #ifndef _KERNEL
1136 void
1137 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
1138 {
1139         panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
1140 }
1141 #endif
1142
1143 /* ARGSUSED */
1144 static void
1145 arc_buf_unwatch(arc_buf_t *buf)
1146 {
1147 #ifndef _KERNEL
1148         if (arc_watch) {
1149                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
1150                     PROT_READ | PROT_WRITE));
1151         }
1152 #endif
1153 }
1154
1155 /* ARGSUSED */
1156 static void
1157 arc_buf_watch(arc_buf_t *buf)
1158 {
1159 #ifndef _KERNEL
1160         if (arc_watch)
1161                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
1162 #endif
1163 }
1164
1165 static arc_buf_contents_t
1166 arc_buf_type(arc_buf_hdr_t *hdr)
1167 {
1168         if (HDR_ISTYPE_METADATA(hdr)) {
1169                 return (ARC_BUFC_METADATA);
1170         } else {
1171                 return (ARC_BUFC_DATA);
1172         }
1173 }
1174
1175 static uint32_t
1176 arc_bufc_to_flags(arc_buf_contents_t type)
1177 {
1178         switch (type) {
1179         case ARC_BUFC_DATA:
1180                 /* metadata field is 0 if buffer contains normal data */
1181                 return (0);
1182         case ARC_BUFC_METADATA:
1183                 return (ARC_FLAG_BUFC_METADATA);
1184         default:
1185                 break;
1186         }
1187         panic("undefined ARC buffer type!");
1188         return ((uint32_t)-1);
1189 }
1190
1191 void
1192 arc_buf_thaw(arc_buf_t *buf)
1193 {
1194         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1195                 if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1196                         panic("modifying non-anon buffer!");
1197                 if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1198                         panic("modifying buffer while i/o in progress!");
1199                 arc_cksum_verify(buf);
1200         }
1201
1202         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1203         if (buf->b_hdr->b_freeze_cksum != NULL) {
1204                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1205                 buf->b_hdr->b_freeze_cksum = NULL;
1206         }
1207
1208         mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1209
1210         arc_buf_unwatch(buf);
1211 }
1212
1213 void
1214 arc_buf_freeze(arc_buf_t *buf)
1215 {
1216         kmutex_t *hash_lock;
1217
1218         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1219                 return;
1220
1221         hash_lock = HDR_LOCK(buf->b_hdr);
1222         mutex_enter(hash_lock);
1223
1224         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1225             buf->b_hdr->b_l1hdr.b_state == arc_anon);
1226         arc_cksum_compute(buf, B_FALSE);
1227         mutex_exit(hash_lock);
1228
1229 }
1230
1231 static void
1232 add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1233 {
1234         arc_state_t *state;
1235
1236         ASSERT(HDR_HAS_L1HDR(hdr));
1237         ASSERT(MUTEX_HELD(hash_lock));
1238
1239         state = hdr->b_l1hdr.b_state;
1240
1241         if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1242             (state != arc_anon)) {
1243                 /* We don't use the L2-only state list. */
1244                 if (state != arc_l2c_only) {
1245                         arc_buf_contents_t type = arc_buf_type(hdr);
1246                         uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1247                         multilist_t *list = &state->arcs_list[type];
1248                         uint64_t *size = &state->arcs_lsize[type];
1249
1250                         multilist_remove(list, hdr);
1251
1252                         if (GHOST_STATE(state)) {
1253                                 ASSERT0(hdr->b_l1hdr.b_datacnt);
1254                                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1255                                 delta = hdr->b_size;
1256                         }
1257                         ASSERT(delta > 0);
1258                         ASSERT3U(*size, >=, delta);
1259                         atomic_add_64(size, -delta);
1260                 }
1261                 /* remove the prefetch flag if we get a reference */
1262                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1263         }
1264 }
1265
1266 static int
1267 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1268 {
1269         int cnt;
1270         arc_state_t *state = hdr->b_l1hdr.b_state;
1271
1272         ASSERT(HDR_HAS_L1HDR(hdr));
1273         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1274         ASSERT(!GHOST_STATE(state));
1275
1276         /*
1277          * arc_l2c_only counts as a ghost state so we don't need to explicitly
1278          * check to prevent usage of the arc_l2c_only list.
1279          */
1280         if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1281             (state != arc_anon)) {
1282                 arc_buf_contents_t type = arc_buf_type(hdr);
1283                 multilist_t *list = &state->arcs_list[type];
1284                 uint64_t *size = &state->arcs_lsize[type];
1285
1286                 multilist_insert(list, hdr);
1287
1288                 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1289                 atomic_add_64(size, hdr->b_size *
1290                     hdr->b_l1hdr.b_datacnt);
1291         }
1292         return (cnt);
1293 }
1294
1295 /*
1296  * Returns detailed information about a specific arc buffer.  When the
1297  * state_index argument is set the function will calculate the arc header
1298  * list position for its arc state.  Since this requires a linear traversal
1299  * callers are strongly encourage not to do this.  However, it can be helpful
1300  * for targeted analysis so the functionality is provided.
1301  */
1302 void
1303 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
1304 {
1305         arc_buf_hdr_t *hdr = ab->b_hdr;
1306         l1arc_buf_hdr_t *l1hdr = NULL;
1307         l2arc_buf_hdr_t *l2hdr = NULL;
1308         arc_state_t *state = NULL;
1309
1310         if (HDR_HAS_L1HDR(hdr)) {
1311                 l1hdr = &hdr->b_l1hdr;
1312                 state = l1hdr->b_state;
1313         }
1314         if (HDR_HAS_L2HDR(hdr))
1315                 l2hdr = &hdr->b_l2hdr;
1316
1317         memset(abi, 0, sizeof (arc_buf_info_t));
1318         abi->abi_flags = hdr->b_flags;
1319
1320         if (l1hdr) {
1321                 abi->abi_datacnt = l1hdr->b_datacnt;
1322                 abi->abi_access = l1hdr->b_arc_access;
1323                 abi->abi_mru_hits = l1hdr->b_mru_hits;
1324                 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
1325                 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
1326                 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
1327                 abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
1328         }
1329
1330         if (l2hdr) {
1331                 abi->abi_l2arc_dattr = l2hdr->b_daddr;
1332                 abi->abi_l2arc_asize = l2hdr->b_asize;
1333                 abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr);
1334                 abi->abi_l2arc_hits = l2hdr->b_hits;
1335         }
1336
1337         abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
1338         abi->abi_state_contents = arc_buf_type(hdr);
1339         abi->abi_size = hdr->b_size;
1340 }
1341
1342 /*
1343  * Move the supplied buffer to the indicated state. The hash lock
1344  * for the buffer must be held by the caller.
1345  */
1346 static void
1347 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1348     kmutex_t *hash_lock)
1349 {
1350         arc_state_t *old_state;
1351         int64_t refcnt;
1352         uint32_t datacnt;
1353         uint64_t from_delta, to_delta;
1354         arc_buf_contents_t buftype = arc_buf_type(hdr);
1355
1356         /*
1357          * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1358          * in arc_read() when bringing a buffer out of the L2ARC.  However, the
1359          * L1 hdr doesn't always exist when we change state to arc_anon before
1360          * destroying a header, in which case reallocating to add the L1 hdr is
1361          * pointless.
1362          */
1363         if (HDR_HAS_L1HDR(hdr)) {
1364                 old_state = hdr->b_l1hdr.b_state;
1365                 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1366                 datacnt = hdr->b_l1hdr.b_datacnt;
1367         } else {
1368                 old_state = arc_l2c_only;
1369                 refcnt = 0;
1370                 datacnt = 0;
1371         }
1372
1373         ASSERT(MUTEX_HELD(hash_lock));
1374         ASSERT3P(new_state, !=, old_state);
1375         ASSERT(refcnt == 0 || datacnt > 0);
1376         ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1377         ASSERT(old_state != arc_anon || datacnt <= 1);
1378
1379         from_delta = to_delta = datacnt * hdr->b_size;
1380
1381         /*
1382          * If this buffer is evictable, transfer it from the
1383          * old state list to the new state list.
1384          */
1385         if (refcnt == 0) {
1386                 if (old_state != arc_anon && old_state != arc_l2c_only) {
1387                         uint64_t *size = &old_state->arcs_lsize[buftype];
1388
1389                         ASSERT(HDR_HAS_L1HDR(hdr));
1390                         multilist_remove(&old_state->arcs_list[buftype], hdr);
1391
1392                         /*
1393                          * If prefetching out of the ghost cache,
1394                          * we will have a non-zero datacnt.
1395                          */
1396                         if (GHOST_STATE(old_state) && datacnt == 0) {
1397                                 /* ghost elements have a ghost size */
1398                                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1399                                 from_delta = hdr->b_size;
1400                         }
1401                         ASSERT3U(*size, >=, from_delta);
1402                         atomic_add_64(size, -from_delta);
1403                 }
1404                 if (new_state != arc_anon && new_state != arc_l2c_only) {
1405                         uint64_t *size = &new_state->arcs_lsize[buftype];
1406
1407                         /*
1408                          * An L1 header always exists here, since if we're
1409                          * moving to some L1-cached state (i.e. not l2c_only or
1410                          * anonymous), we realloc the header to add an L1hdr
1411                          * beforehand.
1412                          */
1413                         ASSERT(HDR_HAS_L1HDR(hdr));
1414                         multilist_insert(&new_state->arcs_list[buftype], hdr);
1415
1416                         /* ghost elements have a ghost size */
1417                         if (GHOST_STATE(new_state)) {
1418                                 ASSERT0(datacnt);
1419                                 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1420                                 to_delta = hdr->b_size;
1421                         }
1422                         atomic_add_64(size, to_delta);
1423                 }
1424         }
1425
1426         ASSERT(!BUF_EMPTY(hdr));
1427         if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1428                 buf_hash_remove(hdr);
1429
1430         /* adjust state sizes (ignore arc_l2c_only) */
1431         if (to_delta && new_state != arc_l2c_only)
1432                 atomic_add_64(&new_state->arcs_size, to_delta);
1433         if (from_delta && old_state != arc_l2c_only) {
1434                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1435                 atomic_add_64(&old_state->arcs_size, -from_delta);
1436         }
1437         if (HDR_HAS_L1HDR(hdr))
1438                 hdr->b_l1hdr.b_state = new_state;
1439
1440         /*
1441          * L2 headers should never be on the L2 state list since they don't
1442          * have L1 headers allocated.
1443          */
1444         ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1445             multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
1446 }
1447
1448 void
1449 arc_space_consume(uint64_t space, arc_space_type_t type)
1450 {
1451         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1452
1453         switch (type) {
1454         default:
1455                 break;
1456         case ARC_SPACE_DATA:
1457                 ARCSTAT_INCR(arcstat_data_size, space);
1458                 break;
1459         case ARC_SPACE_META:
1460                 ARCSTAT_INCR(arcstat_meta_size, space);
1461                 break;
1462         case ARC_SPACE_OTHER:
1463                 ARCSTAT_INCR(arcstat_other_size, space);
1464                 break;
1465         case ARC_SPACE_HDRS:
1466                 ARCSTAT_INCR(arcstat_hdr_size, space);
1467                 break;
1468         case ARC_SPACE_L2HDRS:
1469                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1470                 break;
1471         }
1472
1473         if (type != ARC_SPACE_DATA) {
1474                 ARCSTAT_INCR(arcstat_meta_used, space);
1475                 if (arc_meta_max < arc_meta_used)
1476                         arc_meta_max = arc_meta_used;
1477         }
1478
1479         atomic_add_64(&arc_size, space);
1480 }
1481
1482 void
1483 arc_space_return(uint64_t space, arc_space_type_t type)
1484 {
1485         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1486
1487         switch (type) {
1488         default:
1489                 break;
1490         case ARC_SPACE_DATA:
1491                 ARCSTAT_INCR(arcstat_data_size, -space);
1492                 break;
1493         case ARC_SPACE_META:
1494                 ARCSTAT_INCR(arcstat_meta_size, -space);
1495                 break;
1496         case ARC_SPACE_OTHER:
1497                 ARCSTAT_INCR(arcstat_other_size, -space);
1498                 break;
1499         case ARC_SPACE_HDRS:
1500                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1501                 break;
1502         case ARC_SPACE_L2HDRS:
1503                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1504                 break;
1505         }
1506
1507         if (type != ARC_SPACE_DATA) {
1508                 ASSERT(arc_meta_used >= space);
1509                 ARCSTAT_INCR(arcstat_meta_used, -space);
1510         }
1511
1512         ASSERT(arc_size >= space);
1513         atomic_add_64(&arc_size, -space);
1514 }
1515
1516 arc_buf_t *
1517 arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
1518 {
1519         arc_buf_hdr_t *hdr;
1520         arc_buf_t *buf;
1521
1522         VERIFY3U(size, <=, spa_maxblocksize(spa));
1523         hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
1524         ASSERT(BUF_EMPTY(hdr));
1525         ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
1526         hdr->b_size = size;
1527         hdr->b_spa = spa_load_guid(spa);
1528         hdr->b_l1hdr.b_mru_hits = 0;
1529         hdr->b_l1hdr.b_mru_ghost_hits = 0;
1530         hdr->b_l1hdr.b_mfu_hits = 0;
1531         hdr->b_l1hdr.b_mfu_ghost_hits = 0;
1532         hdr->b_l1hdr.b_l2_hits = 0;
1533
1534         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1535         buf->b_hdr = hdr;
1536         buf->b_data = NULL;
1537         buf->b_efunc = NULL;
1538         buf->b_private = NULL;
1539         buf->b_next = NULL;
1540
1541         hdr->b_flags = arc_bufc_to_flags(type);
1542         hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1543
1544         hdr->b_l1hdr.b_buf = buf;
1545         hdr->b_l1hdr.b_state = arc_anon;
1546         hdr->b_l1hdr.b_arc_access = 0;
1547         hdr->b_l1hdr.b_datacnt = 1;
1548         hdr->b_l1hdr.b_tmp_cdata = NULL;
1549
1550         arc_get_data_buf(buf);
1551
1552         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1553         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1554
1555         return (buf);
1556 }
1557
/*
 * Reference tag used for the hold on a buffer while it is on loan; see
 * arc_loan_buf(), arc_return_buf() and arc_loan_inuse_buf().
 */
static char *arc_onloan_tag = "onloan";
1559
1560 /*
1561  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1562  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1563  * buffers must be returned to the arc before they can be used by the DMU or
1564  * freed.
1565  */
1566 arc_buf_t *
1567 arc_loan_buf(spa_t *spa, uint64_t size)
1568 {
1569         arc_buf_t *buf;
1570
1571         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1572
1573         atomic_add_64(&arc_loaned_bytes, size);
1574         return (buf);
1575 }
1576
1577 /*
1578  * Return a loaned arc buffer to the arc.
1579  */
1580 void
1581 arc_return_buf(arc_buf_t *buf, void *tag)
1582 {
1583         arc_buf_hdr_t *hdr = buf->b_hdr;
1584
1585         ASSERT(buf->b_data != NULL);
1586         ASSERT(HDR_HAS_L1HDR(hdr));
1587         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1588         (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1589
1590         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1591 }
1592
1593 /* Detach an arc_buf from a dbuf (tag) */
1594 void
1595 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1596 {
1597         arc_buf_hdr_t *hdr = buf->b_hdr;
1598
1599         ASSERT(buf->b_data != NULL);
1600         ASSERT(HDR_HAS_L1HDR(hdr));
1601         (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1602         (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
1603         buf->b_efunc = NULL;
1604         buf->b_private = NULL;
1605
1606         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1607 }
1608
1609 static arc_buf_t *
1610 arc_buf_clone(arc_buf_t *from)
1611 {
1612         arc_buf_t *buf;
1613         arc_buf_hdr_t *hdr = from->b_hdr;
1614         uint64_t size = hdr->b_size;
1615
1616         ASSERT(HDR_HAS_L1HDR(hdr));
1617         ASSERT(hdr->b_l1hdr.b_state != arc_anon);
1618
1619         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1620         buf->b_hdr = hdr;
1621         buf->b_data = NULL;
1622         buf->b_efunc = NULL;
1623         buf->b_private = NULL;
1624         buf->b_next = hdr->b_l1hdr.b_buf;
1625         hdr->b_l1hdr.b_buf = buf;
1626         arc_get_data_buf(buf);
1627         bcopy(from->b_data, buf->b_data, size);
1628
1629         /*
1630          * This buffer already exists in the arc so create a duplicate
1631          * copy for the caller.  If the buffer is associated with user data
1632          * then track the size and number of duplicates.  These stats will be
1633          * updated as duplicate buffers are created and destroyed.
1634          */
1635         if (HDR_ISTYPE_DATA(hdr)) {
1636                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1637                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1638         }
1639         hdr->b_l1hdr.b_datacnt += 1;
1640         return (buf);
1641 }
1642
1643 void
1644 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1645 {
1646         arc_buf_hdr_t *hdr;
1647         kmutex_t *hash_lock;
1648
1649         /*
1650          * Check to see if this buffer is evicted.  Callers
1651          * must verify b_data != NULL to know if the add_ref
1652          * was successful.
1653          */
1654         mutex_enter(&buf->b_evict_lock);
1655         if (buf->b_data == NULL) {
1656                 mutex_exit(&buf->b_evict_lock);
1657                 return;
1658         }
1659         hash_lock = HDR_LOCK(buf->b_hdr);
1660         mutex_enter(hash_lock);
1661         hdr = buf->b_hdr;
1662         ASSERT(HDR_HAS_L1HDR(hdr));
1663         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1664         mutex_exit(&buf->b_evict_lock);
1665
1666         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
1667             hdr->b_l1hdr.b_state == arc_mfu);
1668
1669         add_reference(hdr, hash_lock, tag);
1670         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1671         arc_access(hdr, hash_lock);
1672         mutex_exit(hash_lock);
1673         ARCSTAT_BUMP(arcstat_hits);
1674         ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
1675             demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
1676             data, metadata, hits);
1677 }
1678
1679 static void
1680 arc_buf_free_on_write(void *data, size_t size,
1681     void (*free_func)(void *, size_t))
1682 {
1683         l2arc_data_free_t *df;
1684
1685         df = kmem_alloc(sizeof (*df), KM_SLEEP);
1686         df->l2df_data = data;
1687         df->l2df_size = size;
1688         df->l2df_func = free_func;
1689         mutex_enter(&l2arc_free_on_write_mtx);
1690         list_insert_head(l2arc_free_on_write, df);
1691         mutex_exit(&l2arc_free_on_write_mtx);
1692 }
1693
1694 /*
1695  * Free the arc data buffer.  If it is an l2arc write in progress,
1696  * the buffer is placed on l2arc_free_on_write to be freed later.
1697  */
1698 static void
1699 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1700 {
1701         arc_buf_hdr_t *hdr = buf->b_hdr;
1702
1703         if (HDR_L2_WRITING(hdr)) {
1704                 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1705                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1706         } else {
1707                 free_func(buf->b_data, hdr->b_size);
1708         }
1709 }
1710
1711 static void
1712 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1713 {
1714         ASSERT(HDR_HAS_L2HDR(hdr));
1715         ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
1716
1717         /*
1718          * The b_tmp_cdata field is linked off of the b_l1hdr, so if
1719          * that doesn't exist, the header is in the arc_l2c_only state,
1720          * and there isn't anything to free (it's already been freed).
1721          */
1722         if (!HDR_HAS_L1HDR(hdr))
1723                 return;
1724
1725         /*
1726          * The header isn't being written to the l2arc device, thus it
1727          * shouldn't have a b_tmp_cdata to free.
1728          */
1729         if (!HDR_L2_WRITING(hdr)) {
1730                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1731                 return;
1732         }
1733
1734         /*
1735          * The header does not have compression enabled. This can be due
1736          * to the buffer not being compressible, or because we're
1737          * freeing the buffer before the second phase of
1738          * l2arc_write_buffer() has started (which does the compression
1739          * step). In either case, b_tmp_cdata does not point to a
1740          * separately compressed buffer, so there's nothing to free (it
1741          * points to the same buffer as the arc_buf_t's b_data field).
1742          */
1743         if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
1744                 hdr->b_l1hdr.b_tmp_cdata = NULL;
1745                 return;
1746         }
1747
1748         /*
1749          * There's nothing to free since the buffer was all zero's and
1750          * compressed to a zero length buffer.
1751          */
1752         if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
1753                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1754                 return;
1755         }
1756
1757         ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
1758
1759         arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
1760             hdr->b_size, zio_data_buf_free);
1761
1762         ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1763         hdr->b_l1hdr.b_tmp_cdata = NULL;
1764 }
1765
1766 /*
1767  * Free up buf->b_data and if 'remove' is set, then pull the
1768  * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
1769  */
1770 static void
1771 arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
1772 {
1773         arc_buf_t **bufp;
1774
1775         /* free up data associated with the buf */
1776         if (buf->b_data != NULL) {
1777                 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
1778                 uint64_t size = buf->b_hdr->b_size;
1779                 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
1780
1781                 arc_cksum_verify(buf);
1782                 arc_buf_unwatch(buf);
1783
1784                 if (type == ARC_BUFC_METADATA) {
1785                         arc_buf_data_free(buf, zio_buf_free);
1786                         arc_space_return(size, ARC_SPACE_META);
1787                 } else {
1788                         ASSERT(type == ARC_BUFC_DATA);
1789                         arc_buf_data_free(buf, zio_data_buf_free);
1790                         arc_space_return(size, ARC_SPACE_DATA);
1791                 }
1792
1793                 /* protected by hash lock, if in the hash table */
1794                 if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
1795                         uint64_t *cnt = &state->arcs_lsize[type];
1796
1797                         ASSERT(refcount_is_zero(
1798                             &buf->b_hdr->b_l1hdr.b_refcnt));
1799                         ASSERT(state != arc_anon && state != arc_l2c_only);
1800
1801                         ASSERT3U(*cnt, >=, size);
1802                         atomic_add_64(cnt, -size);
1803                 }
1804                 ASSERT3U(state->arcs_size, >=, size);
1805                 atomic_add_64(&state->arcs_size, -size);
1806                 buf->b_data = NULL;
1807
1808                 /*
1809                  * If we're destroying a duplicate buffer make sure
1810                  * that the appropriate statistics are updated.
1811                  */
1812                 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
1813                     HDR_ISTYPE_DATA(buf->b_hdr)) {
1814                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1815                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1816                 }
1817                 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
1818                 buf->b_hdr->b_l1hdr.b_datacnt -= 1;
1819         }
1820
1821         /* only remove the buf if requested */
1822         if (!remove)
1823                 return;
1824
1825         /* remove the buf from the hdr list */
1826         for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
1827             bufp = &(*bufp)->b_next)
1828                 continue;
1829         *bufp = buf->b_next;
1830         buf->b_next = NULL;
1831
1832         ASSERT(buf->b_efunc == NULL);
1833
1834         /* clean up the buf */
1835         buf->b_hdr = NULL;
1836         kmem_cache_free(buf_cache, buf);
1837 }
1838
/*
 * Tear down 'hdr' and return it to the kmem cache it came from.
 *
 * The header must not have I/O in progress and must not be in the hash
 * table; if it has an L1 portion it must be unreferenced and in the
 * arc_anon state.  Any L2 portion is unlinked from its device's buf
 * list, the identity and frozen checksum are discarded, and every
 * arc_buf_t still attached is either destroyed or, when it carries an
 * eviction callback, moved onto arc_eviction_list.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
        if (HDR_HAS_L1HDR(hdr)) {
                ASSERT(hdr->b_l1hdr.b_buf == NULL ||
                    hdr->b_l1hdr.b_datacnt > 0);
                ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
                ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
        }
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
        ASSERT(!HDR_IN_HASH_TABLE(hdr));

        /*
         * NOTE(review): HDR_HAS_L2HDR() is tested before l2ad_mtx is
         * acquired below; confirm that no other thread can tear down
         * the L2 portion of this header in between.
         */
        if (HDR_HAS_L2HDR(hdr)) {
                l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
                /* The caller may already hold the device's l2ad_mtx. */
                boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx);

                if (!buflist_held) {
                        mutex_enter(&l2hdr->b_dev->l2ad_mtx);
                        l2hdr = &hdr->b_l2hdr;
                }

                list_remove(&l2hdr->b_dev->l2ad_buflist, hdr);

                /*
                 * We don't want to leak the b_tmp_cdata buffer that was
                 * allocated in l2arc_write_buffers()
                 */
                arc_buf_l2_cdata_free(hdr);

                /* Undo the L2 header space and size accounting. */
                arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
                ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
                ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);

                /* Only drop the mutex if it was taken here. */
                if (!buflist_held)
                        mutex_exit(&l2hdr->b_dev->l2ad_mtx);

                hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
        }

        if (!BUF_EMPTY(hdr))
                buf_discard_identity(hdr);

        if (hdr->b_freeze_cksum != NULL) {
                kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
                hdr->b_freeze_cksum = NULL;
        }

        if (HDR_HAS_L1HDR(hdr)) {
                /* Dispose of every arc_buf_t still attached to the header. */
                while (hdr->b_l1hdr.b_buf) {
                        arc_buf_t *buf = hdr->b_l1hdr.b_buf;

                        if (buf->b_efunc != NULL) {
                                /*
                                 * The buf has an eviction callback: free
                                 * its data but keep the arc_buf_t, and
                                 * move it from the header's list onto
                                 * arc_eviction_list, then signal
                                 * arc_user_evicts_cv.
                                 */
                                mutex_enter(&arc_user_evicts_lock);
                                mutex_enter(&buf->b_evict_lock);
                                ASSERT(buf->b_hdr != NULL);
                                arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
                                hdr->b_l1hdr.b_buf = buf->b_next;
                                buf->b_hdr = &arc_eviction_hdr;
                                buf->b_next = arc_eviction_list;
                                arc_eviction_list = buf;
                                mutex_exit(&buf->b_evict_lock);
                                cv_signal(&arc_user_evicts_cv);
                                mutex_exit(&arc_user_evicts_lock);
                        } else {
                                /* No callback: unlink and free the buf. */
                                arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
                        }
                }
        }

        ASSERT3P(hdr->b_hash_next, ==, NULL);
        /* Free to the cache that matches the header's layout. */
        if (HDR_HAS_L1HDR(hdr)) {
                ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
                ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
                kmem_cache_free(hdr_full_cache, hdr);
        } else {
                kmem_cache_free(hdr_l2only_cache, hdr);
        }
}
1917
1918 void
1919 arc_buf_free(arc_buf_t *buf, void *tag)
1920 {
1921         arc_buf_hdr_t *hdr = buf->b_hdr;
1922         int hashed = hdr->b_l1hdr.b_state != arc_anon;
1923
1924         ASSERT(buf->b_efunc == NULL);
1925         ASSERT(buf->b_data != NULL);
1926
1927         if (hashed) {
1928                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1929
1930                 mutex_enter(hash_lock);
1931                 hdr = buf->b_hdr;
1932                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1933
1934                 (void) remove_reference(hdr, hash_lock, tag);
1935                 if (hdr->b_l1hdr.b_datacnt > 1) {
1936                         arc_buf_destroy(buf, TRUE);
1937                 } else {
1938                         ASSERT(buf == hdr->b_l1hdr.b_buf);
1939                         ASSERT(buf->b_efunc == NULL);
1940                         hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
1941                 }
1942                 mutex_exit(hash_lock);
1943         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1944                 int destroy_hdr;
1945                 /*
1946                  * We are in the middle of an async write.  Don't destroy
1947                  * this buffer unless the write completes before we finish
1948                  * decrementing the reference count.
1949                  */
1950                 mutex_enter(&arc_user_evicts_lock);
1951                 (void) remove_reference(hdr, NULL, tag);
1952                 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1953                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1954                 mutex_exit(&arc_user_evicts_lock);
1955                 if (destroy_hdr)
1956                         arc_hdr_destroy(hdr);
1957         } else {
1958                 if (remove_reference(hdr, NULL, tag) > 0)
1959                         arc_buf_destroy(buf, TRUE);
1960                 else
1961                         arc_hdr_destroy(hdr);
1962         }
1963 }
1964
1965 boolean_t
1966 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1967 {
1968         arc_buf_hdr_t *hdr = buf->b_hdr;
1969         kmutex_t *hash_lock = NULL;
1970         boolean_t no_callback = (buf->b_efunc == NULL);
1971
1972         if (hdr->b_l1hdr.b_state == arc_anon) {
1973                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
1974                 arc_buf_free(buf, tag);
1975                 return (no_callback);
1976         }
1977
1978         hash_lock = HDR_LOCK(hdr);
1979         mutex_enter(hash_lock);
1980         hdr = buf->b_hdr;
1981         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1982         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1983         ASSERT(hdr->b_l1hdr.b_state != arc_anon);
1984         ASSERT(buf->b_data != NULL);
1985
1986         (void) remove_reference(hdr, hash_lock, tag);
1987         if (hdr->b_l1hdr.b_datacnt > 1) {
1988                 if (no_callback)
1989                         arc_buf_destroy(buf, TRUE);
1990         } else if (no_callback) {
1991                 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
1992                 ASSERT(buf->b_efunc == NULL);
1993                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
1994         }
1995         ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
1996             refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1997         mutex_exit(hash_lock);
1998         return (no_callback);
1999 }
2000
2001 uint64_t
2002 arc_buf_size(arc_buf_t *buf)
2003 {
2004         return (buf->b_hdr->b_size);
2005 }
2006
2007 /*
2008  * Called from the DMU to determine if the current buffer should be
2009  * evicted. In order to ensure proper locking, the eviction must be initiated
2010  * from the DMU. Return true if the buffer is associated with user data and
2011  * duplicate buffers still exist.
2012  */
2013 boolean_t
2014 arc_buf_eviction_needed(arc_buf_t *buf)
2015 {
2016         arc_buf_hdr_t *hdr;
2017         boolean_t evict_needed = B_FALSE;
2018
2019         if (zfs_disable_dup_eviction)
2020                 return (B_FALSE);
2021
2022         mutex_enter(&buf->b_evict_lock);
2023         hdr = buf->b_hdr;
2024         if (hdr == NULL) {
2025                 /*
2026                  * We are in arc_do_user_evicts(); let that function
2027                  * perform the eviction.
2028                  */
2029                 ASSERT(buf->b_data == NULL);
2030                 mutex_exit(&buf->b_evict_lock);
2031                 return (B_FALSE);
2032         } else if (buf->b_data == NULL) {
2033                 /*
2034                  * We have already been added to the arc eviction list;
2035                  * recommend eviction.
2036                  */
2037                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2038                 mutex_exit(&buf->b_evict_lock);
2039                 return (B_TRUE);
2040         }
2041
2042         if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2043                 evict_needed = B_TRUE;
2044
2045         mutex_exit(&buf->b_evict_lock);
2046         return (evict_needed);
2047 }
2048
/*
 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
 * state of the header is dependent on its state prior to entering this
 * function. The following transitions are possible:
 *
 *    - arc_mru -> arc_mru_ghost
 *    - arc_mfu -> arc_mfu_ghost
 *    - arc_mru_ghost -> arc_l2c_only
 *    - arc_mru_ghost -> deleted
 *    - arc_mfu_ghost -> arc_l2c_only
 *    - arc_mfu_ghost -> deleted
 *
 * The caller must hold 'hash_lock' and the header must have an L1
 * portion.  Returns the number of bytes evicted: hdr->b_size for a
 * ghost header that was removed, or hdr->b_size for each data-bearing
 * buf that was released from an mru/mfu header.  Zero is returned when
 * the header was skipped.
 */
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
        arc_state_t *evicted_state, *state;
        int64_t bytes_evicted = 0;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT(HDR_HAS_L1HDR(hdr));

        state = hdr->b_l1hdr.b_state;
        if (GHOST_STATE(state)) {
                ASSERT(!HDR_IO_IN_PROGRESS(hdr));
                ASSERT(hdr->b_l1hdr.b_buf == NULL);

                /*
                 * l2arc_write_buffers() relies on a header's L1 portion
                 * (i.e. its b_tmp_cdata field) during its write phase.
                 * Thus, we cannot push a header onto the arc_l2c_only
                 * state (removing its L1 piece) until the header is
                 * done being written to the l2arc.
                 */
                if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
                        ARCSTAT_BUMP(arcstat_evict_l2_skip);
                        return (bytes_evicted);
                }

                ARCSTAT_BUMP(arcstat_deleted);
                bytes_evicted += hdr->b_size;

                DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);

                if (HDR_HAS_L2HDR(hdr)) {
                        /*
                         * This buffer is cached on the 2nd Level ARC;
                         * don't destroy the header.
                         */
                        arc_change_state(arc_l2c_only, hdr, hash_lock);
                        /*
                         * dropping from L1+L2 cached to L2-only,
                         * realloc to remove the L1 header.
                         */
                        hdr = arc_hdr_realloc(hdr, hdr_full_cache,
                            hdr_l2only_cache);
                } else {
                        /* Not on the l2arc either: the header goes away. */
                        arc_change_state(arc_anon, hdr, hash_lock);
                        arc_hdr_destroy(hdr);
                }
                return (bytes_evicted);
        }

        ASSERT(state == arc_mru || state == arc_mfu);
        evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

        /*
         * Skip headers with I/O in progress.  Prefetch and indirect
         * buffers have a minimum lifespan: they are skipped too until
         * arc_min_prefetch_lifespan ticks have passed since b_arc_access.
         */
        if (HDR_IO_IN_PROGRESS(hdr) ||
            ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
            ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
            arc_min_prefetch_lifespan)) {
                ARCSTAT_BUMP(arcstat_evict_skip);
                return (bytes_evicted);
        }

        ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
        ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
        /*
         * Release the header's bufs one at a time, starting at the head
         * of its buf list.  The walk stops early at the first buf whose
         * evict lock cannot be taken without blocking.
         */
        while (hdr->b_l1hdr.b_buf) {
                arc_buf_t *buf = hdr->b_l1hdr.b_buf;
                if (!mutex_tryenter(&buf->b_evict_lock)) {
                        ARCSTAT_BUMP(arcstat_mutex_miss);
                        break;
                }
                if (buf->b_data != NULL)
                        bytes_evicted += hdr->b_size;
                if (buf->b_efunc != NULL) {
                        /*
                         * The buf has an eviction callback: free its
                         * data but keep the arc_buf_t, moving it onto
                         * arc_eviction_list and signalling
                         * arc_user_evicts_cv.
                         */
                        mutex_enter(&arc_user_evicts_lock);
                        arc_buf_destroy(buf, FALSE);
                        hdr->b_l1hdr.b_buf = buf->b_next;
                        buf->b_hdr = &arc_eviction_hdr;
                        buf->b_next = arc_eviction_list;
                        arc_eviction_list = buf;
                        cv_signal(&arc_user_evicts_cv);
                        mutex_exit(&arc_user_evicts_lock);
                        mutex_exit(&buf->b_evict_lock);
                } else {
                        /* No callback: unlink and free the buf itself. */
                        mutex_exit(&buf->b_evict_lock);
                        arc_buf_destroy(buf, TRUE);
                }
        }

        /* Account the eviction by the header's l2arc status. */
        if (HDR_HAS_L2HDR(hdr)) {
                ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
        } else {
                if (l2arc_write_eligible(hdr->b_spa, hdr))
                        ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
                else
                        ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
        }

        /*
         * The header only moves to its ghost state once every data
         * buffer has been released.
         */
        if (hdr->b_l1hdr.b_datacnt == 0) {
                arc_change_state(evicted_state, hdr, hash_lock);
                ASSERT(HDR_IN_HASH_TABLE(hdr));
                /*
                 * NOTE(review): given the assertion above, setting
                 * ARC_FLAG_IN_HASH_TABLE here looks like a no-op;
                 * confirm before removing.
                 */
                hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
                hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
                DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
        }

        return (bytes_evicted);
}
2168
/*
 * Evict headers from sublist 'idx' of multilist 'ml', resuming from
 * the position recorded by 'marker'.
 *
 * On each iteration the header that precedes 'marker' is examined and
 * the marker is moved past it.  The walk ends when no header precedes
 * the marker, when 'bytes' bytes have been evicted (unless 'bytes' is
 * ARC_EVICT_ALL), or when zfs_arc_evict_batch_limit headers have been
 * evicted.  Marker headers (b_spa == 0), headers that belong to a pool
 * other than a non-zero 'spa', and headers whose hash lock cannot be
 * taken without blocking are skipped.
 *
 * Returns the total number of bytes evicted.
 */
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
    uint64_t spa, int64_t bytes)
{
        multilist_sublist_t *mls;
        uint64_t bytes_evicted = 0;
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock;
        int evict_count = 0;

        ASSERT3P(marker, !=, NULL);
        ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));

        mls = multilist_sublist_lock(ml, idx);

        for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
            hdr = multilist_sublist_prev(mls, marker)) {
                /* Stop once the byte target or the batch limit is hit. */
                if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
                    (evict_count >= zfs_arc_evict_batch_limit))
                        break;

                /*
                 * To keep our iteration location, move the marker
                 * forward. Since we're not holding hdr's hash lock, we
                 * must be very careful and not remove 'hdr' from the
                 * sublist. Otherwise, other consumers might mistake the
                 * 'hdr' as not being on a sublist when they call the
                 * multilist_link_active() function (they all rely on
                 * the hash lock protecting concurrent insertions and
                 * removals). multilist_sublist_move_forward() was
                 * specifically implemented to ensure this is the case
                 * (only 'marker' will be removed and re-inserted).
                 */
                multilist_sublist_move_forward(mls, marker);

                /*
                 * The only case where the b_spa field should ever be
                 * zero, is the marker headers inserted by
                 * arc_evict_state(). It's possible for multiple threads
                 * to be calling arc_evict_state() concurrently (e.g.
                 * dsl_pool_close() and zio_inject_fault()), so we must
                 * skip any markers we see from these other threads.
                 */
                if (hdr->b_spa == 0)
                        continue;

                /* we're only interested in evicting buffers of a certain spa */
                if (spa != 0 && hdr->b_spa != spa) {
                        ARCSTAT_BUMP(arcstat_evict_skip);
                        continue;
                }

                hash_lock = HDR_LOCK(hdr);

                /*
                 * We aren't calling this function from any code path
                 * that would already be holding a hash lock, so we're
                 * asserting on this assumption to be defensive in case
                 * this ever changes. Without this check, it would be
                 * possible to incorrectly increment arcstat_mutex_miss
                 * below (e.g. if the code changed such that we called
                 * this function with a hash lock held).
                 */
                ASSERT(!MUTEX_HELD(hash_lock));

                /* Never block on a hash lock while holding the sublist. */
                if (mutex_tryenter(hash_lock)) {
                        uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
                        mutex_exit(hash_lock);

                        bytes_evicted += evicted;

                        /*
                         * If evicted is zero, arc_evict_hdr() must have
                         * decided to skip this header, don't increment
                         * evict_count in this case.
                         */
                        if (evicted != 0)
                                evict_count++;

                        /*
                         * If arc_size isn't overflowing, signal any
                         * threads that might happen to be waiting.
                         *
                         * For each header evicted, we wake up a single
                         * thread. If we used cv_broadcast, we could
                         * wake up "too many" threads causing arc_size
                         * to significantly overflow arc_c; since
                         * arc_get_data_buf() doesn't check for overflow
                         * when it's woken up (it doesn't because it's
                         * possible for the ARC to be overflowing while
                         * full of un-evictable buffers, and the
                         * function should proceed in this case).
                         *
                         * If threads are left sleeping, due to not
                         * using cv_broadcast, they will be woken up
                         * just before arc_reclaim_thread() sleeps.
                         */
                        mutex_enter(&arc_reclaim_lock);
                        if (!arc_is_overflowing())
                                cv_signal(&arc_reclaim_waiters_cv);
                        mutex_exit(&arc_reclaim_lock);
                } else {
                        /* Hash lock is busy: count the miss and move on. */
                        ARCSTAT_BUMP(arcstat_mutex_miss);
                }
        }

        multilist_sublist_unlock(mls);

        return (bytes_evicted);
}
2279
2280 /*
2281  * Evict buffers from the given arc state, until we've removed the
2282  * specified number of bytes. Move the removed buffers to the
2283  * appropriate evict state.
2284  *
2285  * This function makes a "best effort". It skips over any buffers
2286  * it can't get a hash_lock on, and so, may not catch all candidates.
2287  * It may also return without evicting as much space as requested.
2288  *
2289  * If bytes is specified using the special value ARC_EVICT_ALL, this
2290  * will evict all available (i.e. unlocked and evictable) buffers from
2291  * the given arc state; which is used by arc_flush().
2292  */
2293 static uint64_t
2294 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
2295     arc_buf_contents_t type)
2296 {
2297         uint64_t total_evicted = 0;
2298         multilist_t *ml = &state->arcs_list[type];
2299         int num_sublists;
2300         arc_buf_hdr_t **markers;
2301         int i;
2302
2303         ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
2304
2305         num_sublists = multilist_get_num_sublists(ml);
2306
2307         /*
2308          * If we've tried to evict from each sublist, made some
2309          * progress, but still have not hit the target number of bytes
2310          * to evict, we want to keep trying. The markers allow us to
2311          * pick up where we left off for each individual sublist, rather
2312          * than starting from the tail each time.
2313          */
2314         markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
2315         for (i = 0; i < num_sublists; i++) {
2316                 multilist_sublist_t *mls;
2317
2318                 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
2319
2320                 /*
2321                  * A b_spa of 0 is used to indicate that this header is
2322                  * a marker. This fact is used in arc_adjust_type() and
2323                  * arc_evict_state_impl().
2324                  */
2325                 markers[i]->b_spa = 0;
2326
2327                 mls = multilist_sublist_lock(ml, i);
2328                 multilist_sublist_insert_tail(mls, markers[i]);
2329                 multilist_sublist_unlock(mls);
2330         }
2331
2332         /*
2333          * While we haven't hit our target number of bytes to evict, or
2334          * we're evicting all available buffers.
2335          */
2336         while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
2337                 /*
2338                  * Start eviction using a randomly selected sublist,
2339                  * this is to try and evenly balance eviction across all
2340                  * sublists. Always starting at the same sublist
2341                  * (e.g. index 0) would cause evictions to favor certain
2342                  * sublists over others.
2343                  */
2344                 int sublist_idx = multilist_get_random_index(ml);
2345                 uint64_t scan_evicted = 0;
2346
2347                 for (i = 0; i < num_sublists; i++) {
2348                         uint64_t bytes_remaining;
2349                         uint64_t bytes_evicted;
2350
2351                         if (bytes == ARC_EVICT_ALL)
2352                                 bytes_remaining = ARC_EVICT_ALL;
2353                         else if (total_evicted < bytes)
2354                                 bytes_remaining = bytes - total_evicted;
2355                         else
2356                                 break;
2357
2358                         bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
2359                             markers[sublist_idx], spa, bytes_remaining);
2360
2361                         scan_evicted += bytes_evicted;
2362                         total_evicted += bytes_evicted;
2363
2364                         /* we've reached the end, wrap to the beginning */
2365                         if (++sublist_idx >= num_sublists)
2366                                 sublist_idx = 0;
2367                 }
2368
2369                 /*
2370                  * If we didn't evict anything during this scan, we have
2371                  * no reason to believe we'll evict more during another
2372                  * scan, so break the loop.
2373                  */
2374                 if (scan_evicted == 0) {
2375                         /* This isn't possible, let's make that obvious */
2376                         ASSERT3S(bytes, !=, 0);
2377
2378                         /*
2379                          * When bytes is ARC_EVICT_ALL, the only way to
2380                          * break the loop is when scan_evicted is zero.
2381                          * In that case, we actually have evicted enough,
2382                          * so we don't want to increment the kstat.
2383                          */
2384                         if (bytes != ARC_EVICT_ALL) {
2385                                 ASSERT3S(total_evicted, <, bytes);
2386                                 ARCSTAT_BUMP(arcstat_evict_not_enough);
2387                         }
2388
2389                         break;
2390                 }
2391         }
2392
2393         for (i = 0; i < num_sublists; i++) {
2394                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
2395                 multilist_sublist_remove(mls, markers[i]);
2396                 multilist_sublist_unlock(mls);
2397
2398                 kmem_cache_free(hdr_full_cache, markers[i]);
2399         }
2400         kmem_free(markers, sizeof (*markers) * num_sublists);
2401
2402         return (total_evicted);
2403 }
2404
2405 /*
2406  * Flush all "evictable" data of the given type from the arc state
2407  * specified. This will not evict any "active" buffers (i.e. referenced).
2408  *
2409  * When 'retry' is set to FALSE, the function will make a single pass
2410  * over the state and evict any buffers that it can. Since it doesn't
2411  * continually retry the eviction, it might end up leaving some buffers
2412  * in the ARC due to lock misses.
2413  *
2414  * When 'retry' is set to TRUE, the function will continually retry the
2415  * eviction until *all* evictable buffers have been removed from the
2416  * state. As a result, if concurrent insertions into the state are
2417  * allowed (e.g. if the ARC isn't shutting down), this function might
2418  * wind up in an infinite loop, continually trying to evict buffers.
2419  */
2420 static uint64_t
2421 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
2422     boolean_t retry)
2423 {
2424         uint64_t evicted = 0;
2425
2426         while (state->arcs_lsize[type] != 0) {
2427                 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
2428
2429                 if (!retry)
2430                         break;
2431         }
2432
2433         return (evicted);
2434 }
2435
2436 /*
2437  * Helper function for arc_prune() it is responsible for safely handling
2438  * the execution of a registered arc_prune_func_t.
2439  */
2440 static void
2441 arc_prune_task(void *ptr)
2442 {
2443         arc_prune_t *ap = (arc_prune_t *)ptr;
2444         arc_prune_func_t *func = ap->p_pfunc;
2445
2446         if (func != NULL)
2447                 func(ap->p_adjust, ap->p_private);
2448
2449         /* Callback unregistered concurrently with execution */
2450         if (refcount_remove(&ap->p_refcnt, func) == 0) {
2451                 ASSERT(!list_link_active(&ap->p_node));
2452                 refcount_destroy(&ap->p_refcnt);
2453                 kmem_free(ap, sizeof (*ap));
2454         }
2455 }
2456
2457 /*
2458  * Notify registered consumers they must drop holds on a portion of the ARC
2459  * buffered they reference.  This provides a mechanism to ensure the ARC can
2460  * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
2461  * is analogous to dnlc_reduce_cache() but more generic.
2462  *
2463  * This operation is performed asyncronously so it may be safely called
2464  * in the context of the arc_adapt_thread().  A reference is taken here
2465  * for each registered arc_prune_t and the arc_prune_task() is responsible
2466  * for releasing it once the registered arc_prune_func_t has completed.
2467  */
2468 static void
2469 arc_prune_async(int64_t adjust)
2470 {
2471         arc_prune_t *ap;
2472
2473         mutex_enter(&arc_prune_mtx);
2474         for (ap = list_head(&arc_prune_list); ap != NULL;
2475             ap = list_next(&arc_prune_list, ap)) {
2476
2477                 if (refcount_count(&ap->p_refcnt) >= 2)
2478                         continue;
2479
2480                 refcount_add(&ap->p_refcnt, ap->p_pfunc);
2481                 ap->p_adjust = adjust;
2482                 taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
2483                 ARCSTAT_BUMP(arcstat_prune);
2484         }
2485         mutex_exit(&arc_prune_mtx);
2486 }
2487
2488 static void
2489 arc_prune(int64_t adjust)
2490 {
2491         arc_prune_async(adjust);
2492         taskq_wait_outstanding(arc_prune_taskq, 0);
2493 }
2494
2495 /*
2496  * Evict the specified number of bytes from the state specified,
2497  * restricting eviction to the spa and type given. This function
2498  * prevents us from trying to evict more from a state's list than
2499  * is "evictable", and to skip evicting altogether when passed a
2500  * negative value for "bytes". In contrast, arc_evict_state() will
2501  * evict everything it can, when passed a negative value for "bytes".
2502  */
2503 static uint64_t
2504 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
2505     arc_buf_contents_t type)
2506 {
2507         int64_t delta;
2508
2509         if (bytes > 0 && state->arcs_lsize[type] > 0) {
2510                 delta = MIN(state->arcs_lsize[type], bytes);
2511                 return (arc_evict_state(state, spa, delta, type));
2512         }
2513
2514         return (0);
2515 }
2516
2517 /*
2518  * The goal of this function is to evict enough meta data buffers from the
2519  * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
2520  * more complicated than it appears because it is common for data buffers
2521  * to have holds on meta data buffers.  In addition, dnode meta data buffers
2522  * will be held by the dnodes in the block preventing them from being freed.
2523  * This means we can't simply traverse the ARC and expect to always find
2524  * enough unheld meta data buffer to release.
2525  *
2526  * Therefore, this function has been updated to make alternating passes
2527  * over the ARC releasing data buffers and then newly unheld meta data
2528  * buffers.  This ensures forward progress is maintained and arc_meta_used
2529  * will decrease.  Normally this is sufficient, but if required the ARC
2530  * will call the registered prune callbacks causing dentry and inodes to
2531  * be dropped from the VFS cache.  This will make dnode meta data buffers
2532  * available for reclaim.
2533  */
2534 static uint64_t
2535 arc_adjust_meta_balanced(void)
2536 {
2537         int64_t adjustmnt, delta, prune = 0;
2538         uint64_t total_evicted = 0;
2539         arc_buf_contents_t type = ARC_BUFC_DATA;
2540         unsigned long restarts = zfs_arc_meta_adjust_restarts;
2541
2542 restart:
2543         /*
2544          * This slightly differs than the way we evict from the mru in
2545          * arc_adjust because we don't have a "target" value (i.e. no
2546          * "meta" arc_p). As a result, I think we can completely
2547          * cannibalize the metadata in the MRU before we evict the
2548          * metadata from the MFU. I think we probably need to implement a
2549          * "metadata arc_p" value to do this properly.
2550          */
2551         adjustmnt = arc_meta_used - arc_meta_limit;
2552
2553         if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
2554                 delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
2555                 total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
2556                 adjustmnt -= delta;
2557         }
2558
2559         /*
2560          * We can't afford to recalculate adjustmnt here. If we do,
2561          * new metadata buffers can sneak into the MRU or ANON lists,
2562          * thus penalize the MFU metadata. Although the fudge factor is
2563          * small, it has been empirically shown to be significant for
2564          * certain workloads (e.g. creating many empty directories). As
2565          * such, we use the original calculation for adjustmnt, and
2566          * simply decrement the amount of data evicted from the MRU.
2567          */
2568
2569         if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
2570                 delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
2571                 total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
2572         }
2573
2574         adjustmnt = arc_meta_used - arc_meta_limit;
2575
2576         if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2577                 delta = MIN(adjustmnt,
2578                     arc_mru_ghost->arcs_lsize[type]);
2579                 total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
2580                 adjustmnt -= delta;
2581         }
2582
2583         if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
2584                 delta = MIN(adjustmnt,
2585                     arc_mfu_ghost->arcs_lsize[type]);
2586                 total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
2587         }
2588
2589         /*
2590          * If after attempting to make the requested adjustment to the ARC
2591          * the meta limit is still being exceeded then request that the
2592          * higher layers drop some cached objects which have holds on ARC
2593          * meta buffers.  Requests to the upper layers will be made with
2594          * increasingly large scan sizes until the ARC is below the limit.
2595          */
2596         if (arc_meta_used > arc_meta_limit) {
2597                 if (type == ARC_BUFC_DATA) {
2598                         type = ARC_BUFC_METADATA;
2599                 } else {
2600                         type = ARC_BUFC_DATA;
2601
2602                         if (zfs_arc_meta_prune) {
2603                                 prune += zfs_arc_meta_prune;
2604                                 arc_prune_async(prune);
2605                         }
2606                 }
2607
2608                 if (restarts > 0) {
2609                         restarts--;
2610                         goto restart;
2611                 }
2612         }
2613         return (total_evicted);
2614 }
2615
2616 /*
2617  * Evict metadata buffers from the cache, such that arc_meta_used is
2618  * capped by the arc_meta_limit tunable.
2619  */
2620 static uint64_t
2621 arc_adjust_meta_only(void)
2622 {
2623         uint64_t total_evicted = 0;
2624         int64_t target;
2625
2626         /*
2627          * If we're over the meta limit, we want to evict enough
2628          * metadata to get back under the meta limit. We don't want to
2629          * evict so much that we drop the MRU below arc_p, though. If
2630          * we're over the meta limit more than we're over arc_p, we
2631          * evict some from the MRU here, and some from the MFU below.
2632          */
2633         target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
2634             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
2635
2636         total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2637
2638         /*
2639          * Similar to the above, we want to evict enough bytes to get us
2640          * below the meta limit, but not so much as to drop us below the
2641          * space alloted to the MFU (which is defined as arc_c - arc_p).
2642          */
2643         target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
2644             (int64_t)(arc_mfu->arcs_size - (arc_c - arc_p)));
2645
2646         total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2647
2648         return (total_evicted);
2649 }
2650
2651 static uint64_t
2652 arc_adjust_meta(void)
2653 {
2654         if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
2655                 return (arc_adjust_meta_only());
2656         else
2657                 return (arc_adjust_meta_balanced());
2658 }
2659
2660 /*
2661  * Return the type of the oldest buffer in the given arc state
2662  *
2663  * This function will select a random sublist of type ARC_BUFC_DATA and
2664  * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
2665  * is compared, and the type which contains the "older" buffer will be
2666  * returned.
2667  */
2668 static arc_buf_contents_t
2669 arc_adjust_type(arc_state_t *state)
2670 {
2671         multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
2672         multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
2673         int data_idx = multilist_get_random_index(data_ml);
2674         int meta_idx = multilist_get_random_index(meta_ml);
2675         multilist_sublist_t *data_mls;
2676         multilist_sublist_t *meta_mls;
2677         arc_buf_contents_t type;
2678         arc_buf_hdr_t *data_hdr;
2679         arc_buf_hdr_t *meta_hdr;
2680
2681         /*
2682          * We keep the sublist lock until we're finished, to prevent
2683          * the headers from being destroyed via arc_evict_state().
2684          */
2685         data_mls = multilist_sublist_lock(data_ml, data_idx);
2686         meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
2687
2688         /*
2689          * These two loops are to ensure we skip any markers that
2690          * might be at the tail of the lists due to arc_evict_state().
2691          */
2692
2693         for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
2694             data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
2695                 if (data_hdr->b_spa != 0)
2696                         break;
2697         }
2698
2699         for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
2700             meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
2701                 if (meta_hdr->b_spa != 0)
2702                         break;
2703         }
2704
2705         if (data_hdr == NULL && meta_hdr == NULL) {
2706                 type = ARC_BUFC_DATA;
2707         } else if (data_hdr == NULL) {
2708                 ASSERT3P(meta_hdr, !=, NULL);
2709                 type = ARC_BUFC_METADATA;
2710         } else if (meta_hdr == NULL) {
2711                 ASSERT3P(data_hdr, !=, NULL);
2712                 type = ARC_BUFC_DATA;
2713         } else {
2714                 ASSERT3P(data_hdr, !=, NULL);
2715                 ASSERT3P(meta_hdr, !=, NULL);
2716
2717                 /* The headers can't be on the sublist without an L1 header */
2718                 ASSERT(HDR_HAS_L1HDR(data_hdr));
2719                 ASSERT(HDR_HAS_L1HDR(meta_hdr));
2720
2721                 if (data_hdr->b_l1hdr.b_arc_access <
2722                     meta_hdr->b_l1hdr.b_arc_access) {
2723                         type = ARC_BUFC_DATA;
2724                 } else {
2725                         type = ARC_BUFC_METADATA;
2726                 }
2727         }
2728
2729         multilist_sublist_unlock(meta_mls);
2730         multilist_sublist_unlock(data_mls);
2731
2732         return (type);
2733 }
2734
2735 /*
2736  * Evict buffers from the cache, such that arc_size is capped by arc_c.
2737  */
2738 static uint64_t
2739 arc_adjust(void)
2740 {
2741         uint64_t total_evicted = 0;
2742         uint64_t bytes;
2743         int64_t target;
2744
2745         /*
2746          * If we're over arc_meta_limit, we want to correct that before
2747          * potentially evicting data buffers below.
2748          */
2749         total_evicted += arc_adjust_meta();
2750
2751         /*
2752          * Adjust MRU size
2753          *
2754          * If we're over the target cache size, we want to evict enough
2755          * from the list to get back to our target size. We don't want
2756          * to evict too much from the MRU, such that it drops below
2757          * arc_p. So, if we're over our target cache size more than
2758          * the MRU is over arc_p, we'll evict enough to get back to
2759          * arc_p here, and then evict more from the MFU below.
2760          */
2761         target = MIN((int64_t)(arc_size - arc_c),
2762             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2763             arc_p));
2764
2765         /*
2766          * If we're below arc_meta_min, always prefer to evict data.
2767          * Otherwise, try to satisfy the requested number of bytes to
2768          * evict from the type which contains older buffers; in an
2769          * effort to keep newer buffers in the cache regardless of their
2770          * type. If we cannot satisfy the number of bytes from this
2771          * type, spill over into the next type.
2772          */
2773         if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
2774             arc_meta_used > arc_meta_min) {
2775                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2776                 total_evicted += bytes;
2777
2778                 /*
2779                  * If we couldn't evict our target number of bytes from
2780                  * metadata, we try to get the rest from data.
2781                  */
2782                 target -= bytes;
2783
2784                 total_evicted +=
2785                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
2786         } else {
2787                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
2788                 total_evicted += bytes;
2789
2790                 /*
2791                  * If we couldn't evict our target number of bytes from
2792                  * data, we try to get the rest from metadata.
2793                  */
2794                 target -= bytes;
2795
2796                 total_evicted +=
2797                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
2798         }
2799
2800         /*
2801          * Adjust MFU size
2802          *
2803          * Now that we've tried to evict enough from the MRU to get its
2804          * size back to arc_p, if we're still above the target cache
2805          * size, we evict the rest from the MFU.
2806          */
2807         target = arc_size - arc_c;
2808
2809         if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
2810             arc_meta_used > arc_meta_min) {
2811                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2812                 total_evicted += bytes;
2813
2814                 /*
2815                  * If we couldn't evict our target number of bytes from
2816                  * metadata, we try to get the rest from data.
2817                  */
2818                 target -= bytes;
2819
2820                 total_evicted +=
2821                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
2822         } else {
2823                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
2824                 total_evicted += bytes;
2825
2826                 /*
2827                  * If we couldn't evict our target number of bytes from
2828                  * data, we try to get the rest from data.
2829                  */
2830                 target -= bytes;
2831
2832                 total_evicted +=
2833                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
2834         }
2835
2836         /*
2837          * Adjust ghost lists
2838          *
2839          * In addition to the above, the ARC also defines target values
2840          * for the ghost lists. The sum of the mru list and mru ghost
2841          * list should never exceed the target size of the cache, and
2842          * the sum of the mru list, mfu list, mru ghost list, and mfu
2843          * ghost list should never exceed twice the target size of the
2844          * cache. The following logic enforces these limits on the ghost
2845          * caches, and evicts from them as needed.
2846          */
2847         target = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2848
2849         bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
2850         total_evicted += bytes;
2851
2852         target -= bytes;
2853
2854         total_evicted +=
2855             arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
2856
2857         /*
2858          * We assume the sum of the mru list and mfu list is less than
2859          * or equal to arc_c (we enforced this above), which means we
2860          * can use the simpler of the two equations below:
2861          *
2862          *      mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
2863          *                  mru ghost + mfu ghost <= arc_c
2864          */
2865         target = arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2866
2867         bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
2868         total_evicted += bytes;
2869
2870         target -= bytes;
2871
2872         total_evicted +=
2873             arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
2874
2875         return (total_evicted);
2876 }
2877
2878 static void
2879 arc_do_user_evicts(void)
2880 {
2881         mutex_enter(&arc_user_evicts_lock);
2882         while (arc_eviction_list != NULL) {
2883                 arc_buf_t *buf = arc_eviction_list;
2884                 arc_eviction_list = buf->b_next;
2885                 mutex_enter(&buf->b_evict_lock);
2886                 buf->b_hdr = NULL;
2887                 mutex_exit(&buf->b_evict_lock);
2888                 mutex_exit(&arc_user_evicts_lock);
2889
2890                 if (buf->b_efunc != NULL)
2891                         VERIFY0(buf->b_efunc(buf->b_private));
2892
2893                 buf->b_efunc = NULL;
2894                 buf->b_private = NULL;
2895                 kmem_cache_free(buf_cache, buf);
2896                 mutex_enter(&arc_user_evicts_lock);
2897         }
2898         mutex_exit(&arc_user_evicts_lock);
2899 }
2900
2901 void
2902 arc_flush(spa_t *spa, boolean_t retry)
2903 {
2904         uint64_t guid = 0;
2905
2906         /*
2907          * If retry is TRUE, a spa must not be specified since we have
2908          * no good way to determine if all of a spa's buffers have been
2909          * evicted from an arc state.
2910          */
2911         ASSERT(!retry || spa == 0);
2912
2913         if (spa != NULL)
2914                 guid = spa_load_guid(spa);
2915
2916         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
2917         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
2918
2919         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
2920         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
2921
2922         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
2923         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
2924
2925         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
2926         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
2927
2928         arc_do_user_evicts();
2929         ASSERT(spa || arc_eviction_list == NULL);
2930 }
2931
2932 void
2933 arc_shrink(uint64_t bytes)
2934 {
2935         if (arc_c > arc_c_min) {
2936                 uint64_t to_free;
2937
2938                 to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
2939
2940                 if (arc_c > arc_c_min + to_free)
2941                         atomic_add_64(&arc_c, -to_free);
2942                 else
2943                         arc_c = arc_c_min;
2944
2945                 to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
2946
2947                 if (arc_p > to_free)
2948                         atomic_add_64(&arc_p, -to_free);
2949                 else
2950                         arc_p = 0;
2951
2952                 if (arc_c > arc_size)
2953                         arc_c = MAX(arc_size, arc_c_min);
2954                 if (arc_p > arc_c)
2955                         arc_p = (arc_c >> 1);
2956                 ASSERT(arc_c >= arc_c_min);
2957                 ASSERT((int64_t)arc_p >= 0);
2958         }
2959
2960         if (arc_size > arc_c)
2961                 (void) arc_adjust();
2962 }
2963
2964 static void
2965 arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
2966 {
2967         size_t                  i;
2968         kmem_cache_t            *prev_cache = NULL;
2969         kmem_cache_t            *prev_data_cache = NULL;
2970         extern kmem_cache_t     *zio_buf_cache[];
2971         extern kmem_cache_t     *zio_data_buf_cache[];
2972
2973         if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
2974                 /*
2975                  * We are exceeding our meta-data cache limit.
2976                  * Prune some entries to release holds on meta-data.
2977                  */
2978                 arc_prune(zfs_arc_meta_prune);
2979         }
2980
2981         /*
2982          * An aggressive reclamation will shrink the cache size as well as
2983          * reap free buffers from the arc kmem caches.
2984          */
2985         if (strat == ARC_RECLAIM_AGGR)
2986                 arc_shrink(bytes);
2987
2988         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2989                 if (zio_buf_cache[i] != prev_cache) {
2990                         prev_cache = zio_buf_cache[i];
2991                         kmem_cache_reap_now(zio_buf_cache[i]);
2992                 }
2993                 if (zio_data_buf_cache[i] != prev_data_cache) {
2994                         prev_data_cache = zio_data_buf_cache[i];
2995                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2996                 }
2997         }
2998
2999         kmem_cache_reap_now(buf_cache);
3000         kmem_cache_reap_now(hdr_full_cache);
3001         kmem_cache_reap_now(hdr_l2only_cache);
3002 }
3003
3004 /*
3005  * Threads can block in arc_get_data_buf() waiting for this thread to evict
3006  * enough data and signal them to proceed. When this happens, the threads in
3007  * arc_get_data_buf() are sleeping while holding the hash lock for their
3008  * particular arc header. Thus, we must be careful to never sleep on a
3009  * hash lock in this thread. This is to prevent the following deadlock:
3010  *
3011  *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
3012  *    waiting for the reclaim thread to signal it.
3013  *
3014  *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
3015  *    fails, and goes to sleep forever.
3016  *
3017  * This possible deadlock is avoided by always acquiring a hash lock
3018  * using mutex_tryenter() from arc_reclaim_thread().
3019  */
3020 static void
3021 arc_adapt_thread(void)
3022 {
3023         callb_cpr_t             cpr;
3024         fstrans_cookie_t        cookie;
3025         uint64_t                arc_evicted;
3026
3027         CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
3028
3029         cookie = spl_fstrans_mark();
3030         mutex_enter(&arc_reclaim_lock);
3031         while (arc_reclaim_thread_exit == 0) {
3032 #ifndef _KERNEL
3033                 arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
3034
3035                 mutex_exit(&arc_reclaim_lock);
3036                 if (spa_get_random(100) == 0) {
3037
3038                         if (arc_no_grow) {
3039                                 if (last_reclaim == ARC_RECLAIM_CONS) {
3040                                         last_reclaim = ARC_RECLAIM_AGGR;
3041                                 } else {
3042                                         last_reclaim = ARC_RECLAIM_CONS;
3043                                 }
3044                         } else {
3045                                 arc_no_grow = TRUE;
3046                                 last_reclaim = ARC_RECLAIM_AGGR;
3047                                 membar_producer();
3048                         }
3049
3050                         /* reset the growth delay for every reclaim */
3051                         arc_grow_time = ddi_get_lbolt() +
3052                             (zfs_arc_grow_retry * hz);
3053
3054                         arc_kmem_reap_now(last_reclaim, 0);
3055                         arc_warm = B_TRUE;
3056                 }
3057 #else /* _KERNEL */
3058                 mutex_exit(&arc_reclaim_lock);
3059 #endif /* !_KERNEL */
3060
3061                 /* No recent memory pressure allow the ARC to grow. */
3062                 if (arc_no_grow &&
3063                     ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
3064                         arc_no_grow = FALSE;
3065
3066                 arc_evicted = arc_adjust();
3067
3068                 /*
3069                  * We're either no longer overflowing, or we
3070                  * can't evict anything more, so we should wake
3071                  * up any threads before we go to sleep.
3072                  */
3073                 if (arc_size <= arc_c || arc_evicted == 0)
3074                         cv_broadcast(&arc_reclaim_waiters_cv);
3075
3076                 mutex_enter(&arc_reclaim_lock);
3077
3078                 /* block until needed, or one second, whichever is shorter */
3079                 CALLB_CPR_SAFE_BEGIN(&cpr);
3080                 (void) cv_timedwait_interruptible(&arc_reclaim_thread_cv,
3081                     &arc_reclaim_lock, (ddi_get_lbolt() + hz));
3082                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
3083
3084
3085                 /* Allow the module options to be changed */
3086                 if (zfs_arc_max > 64 << 20 &&
3087                     zfs_arc_max < physmem * PAGESIZE &&
3088                     zfs_arc_max != arc_c_max)
3089                         arc_c_max = zfs_arc_max;
3090
3091                 if (zfs_arc_min > 0 &&
3092                     zfs_arc_min < arc_c_max &&
3093                     zfs_arc_min != arc_c_min)
3094                         arc_c_min = zfs_arc_min;
3095
3096                 if (zfs_arc_meta_limit > 0 &&
3097                     zfs_arc_meta_limit <= arc_c_max &&
3098                     zfs_arc_meta_limit != arc_meta_limit)
3099                         arc_meta_limit = zfs_arc_meta_limit;
3100         }
3101
3102         arc_reclaim_thread_exit = 0;
3103         cv_broadcast(&arc_reclaim_thread_cv);
3104         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_lock */
3105         spl_fstrans_unmark(cookie);
3106         thread_exit();
3107 }
3108
3109 static void
3110 arc_user_evicts_thread(void)
3111 {
3112         callb_cpr_t cpr;
3113         fstrans_cookie_t        cookie;
3114
3115         CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
3116
3117         cookie = spl_fstrans_mark();
3118         mutex_enter(&arc_user_evicts_lock);
3119         while (!arc_user_evicts_thread_exit) {
3120                 mutex_exit(&arc_user_evicts_lock);
3121
3122                 arc_do_user_evicts();
3123
3124                 /*
3125                  * This is necessary in order for the mdb ::arc dcmd to
3126                  * show up to date information. Since the ::arc command
3127                  * does not call the kstat's update function, without
3128                  * this call, the command may show stale stats for the
3129                  * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3130                  * with this change, the data might be up to 1 second
3131                  * out of date; but that should suffice. The arc_state_t
3132                  * structures can be queried directly if more accurate
3133                  * information is needed.
3134                  */
3135                 if (arc_ksp != NULL)
3136                         arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3137
3138                 mutex_enter(&arc_user_evicts_lock);
3139
3140                 /*
3141                  * Block until signaled, or after one second (we need to
3142                  * call the arc's kstat update function regularly).
3143                  */
3144                 CALLB_CPR_SAFE_BEGIN(&cpr);
3145                 (void) cv_timedwait_interruptible(&arc_user_evicts_cv,
3146                     &arc_user_evicts_lock, ddi_get_lbolt() + hz);
3147                 CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
3148         }
3149
3150         arc_user_evicts_thread_exit = FALSE;
3151         cv_broadcast(&arc_user_evicts_cv);
3152         CALLB_CPR_EXIT(&cpr);           /* drops arc_user_evicts_lock */
3153         spl_fstrans_unmark(cookie);
3154         thread_exit();
3155 }
3156
3157 #ifdef _KERNEL
3158 /*
3159  * Determine the amount of memory eligible for eviction contained in the
3160  * ARC. All clean data reported by the ghost lists can always be safely
3161  * evicted. Due to arc_c_min, the same does not hold for all clean data
3162  * contained by the regular mru and mfu lists.
3163  *
3164  * In the case of the regular mru and mfu lists, we need to report as
3165  * much clean data as possible, such that evicting that same reported
3166  * data will not bring arc_size below arc_c_min. Thus, in certain
3167  * circumstances, the total amount of clean data in the mru and mfu
3168  * lists might not actually be evictable.
3169  *
3170  * The following two distinct cases are accounted for:
3171  *
3172  * 1. The sum of the amount of dirty data contained by both the mru and
3173  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
3174  *    is greater than or equal to arc_c_min.
3175  *    (i.e. amount of dirty data >= arc_c_min)
3176  *
3177  *    This is the easy case; all clean data contained by the mru and mfu
3178  *    lists is evictable. Evicting all clean data can only drop arc_size
3179  *    to the amount of dirty data, which is greater than arc_c_min.
3180  *
3181  * 2. The sum of the amount of dirty data contained by both the mru and
3182  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
3183  *    is less than arc_c_min.
3184  *    (i.e. arc_c_min > amount of dirty data)
3185  *
3186  *    2.1. arc_size is greater than or equal to arc_c_min.
3187  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
3188  *
3189  *         In this case, not all clean data from the regular mru and mfu
3190  *         lists is actually evictable; we must leave enough clean data
3191  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
3192  *         evictable data from the two lists combined, is exactly the
3193  *         difference between arc_size and arc_c_min.
3194  *
3195  *    2.2. arc_size is less than arc_c_min
3196  *         (i.e. arc_c_min > arc_size > amount of dirty data)
3197  *
3198  *         In this case, none of the data contained in the mru and mfu
3199  *         lists is evictable, even if it's clean. Since arc_size is
3200  *         already below arc_c_min, evicting any more would only
3201  *         increase this negative difference.
3202  */
3203 static uint64_t
3204 arc_evictable_memory(void) {
3205         uint64_t arc_clean =
3206             arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3207             arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3208             arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3209             arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3210         uint64_t ghost_clean =
3211             arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
3212             arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
3213             arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
3214             arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
3215         uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
3216
3217         if (arc_dirty >= arc_c_min)
3218                 return (ghost_clean + arc_clean);
3219
3220         return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
3221 }
3222
3223 /*
3224  * If sc->nr_to_scan is zero, the caller is requesting a query of the
3225  * number of objects which can potentially be freed.  If it is nonzero,
3226  * the request is to free that many objects.
3227  *
3228  * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
3229  * in struct shrinker and also require the shrinker to return the number
3230  * of objects freed.
3231  *
3232  * Older kernels require the shrinker to return the number of freeable
3233  * objects following the freeing of nr_to_free.
3234  */
3235 static spl_shrinker_t
3236 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
3237 {
3238         int64_t pages;
3239
3240         /* The arc is considered warm once reclaim has occurred */
3241         if (unlikely(arc_warm == B_FALSE))
3242                 arc_warm = B_TRUE;
3243
3244         /* Return the potential number of reclaimable pages */
3245         pages = btop((int64_t)arc_evictable_memory());
3246         if (sc->nr_to_scan == 0)
3247                 return (pages);
3248
3249         /* Not allowed to perform filesystem reclaim */
3250         if (!(sc->gfp_mask & __GFP_FS))
3251                 return (SHRINK_STOP);
3252
3253         /* Reclaim in progress */
3254         if (mutex_tryenter(&arc_reclaim_lock) == 0)
3255                 return (SHRINK_STOP);
3256
3257         mutex_exit(&arc_reclaim_lock);
3258
3259         /*
3260          * Evict the requested number of pages by shrinking arc_c the
3261          * requested amount.  If there is nothing left to evict just
3262          * reap whatever we can from the various arc slabs.
3263          */
3264         if (pages > 0) {
3265                 arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
3266
3267 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
3268                 pages = MAX(pages - btop(arc_evictable_memory()), 0);
3269 #else
3270                 pages = btop(arc_evictable_memory());
3271 #endif
3272         } else {
3273                 arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
3274                 pages = SHRINK_STOP;
3275         }
3276
3277         /*
3278          * We've reaped what we can, wake up threads.
3279          */
3280         cv_broadcast(&arc_reclaim_waiters_cv);
3281
3282         /*
3283          * When direct reclaim is observed it usually indicates a rapid
3284          * increase in memory pressure.  This occurs because the kswapd
3285          * threads were unable to asynchronously keep enough free memory
3286          * available.  In this case set arc_no_grow to briefly pause arc
3287          * growth to avoid compounding the memory pressure.
3288          */
3289         if (current_is_kswapd()) {
3290                 ARCSTAT_BUMP(arcstat_memory_indirect_count);
3291         } else {
3292                 arc_no_grow = B_TRUE;
3293                 arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
3294                 ARCSTAT_BUMP(arcstat_memory_direct_count);
3295         }
3296
3297         return (pages);
3298 }
3299 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
3300
3301 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
3302 #endif /* _KERNEL */
3303
3304 /*
3305  * Adapt arc info given the number of bytes we are trying to add and
3306  * the state that we are comming from.  This function is only called
3307  * when we are adding new content to the cache.
3308  */
3309 static void
3310 arc_adapt(int bytes, arc_state_t *state)
3311 {
3312         int mult;
3313
3314         if (state == arc_l2c_only)
3315                 return;
3316
3317         ASSERT(bytes > 0);
3318         /*
3319          * Adapt the target size of the MRU list:
3320          *      - if we just hit in the MRU ghost list, then increase
3321          *        the target size of the MRU list.
3322          *      - if we just hit in the MFU ghost list, then increase
3323          *        the target size of the MFU list by decreasing the
3324          *        target size of the MRU list.
3325          */
3326         if (state == arc_mru_ghost) {
3327                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
3328                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
3329
3330                 if (!zfs_arc_p_dampener_disable)
3331                         mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3332
3333                 arc_p = MIN(arc_c, arc_p + bytes * mult);
3334         } else if (state == arc_mfu_ghost) {
3335                 uint64_t delta;
3336
3337                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3338                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
3339
3340                 if (!zfs_arc_p_dampener_disable)
3341                         mult = MIN(mult, 10);
3342
3343                 delta = MIN(bytes * mult, arc_p);
3344                 arc_p = MAX(0, arc_p - delta);
3345         }
3346         ASSERT((int64_t)arc_p >= 0);
3347
3348         if (arc_no_grow)
3349                 return;
3350
3351         if (arc_c >= arc_c_max)
3352                 return;
3353
3354         /*
3355          * If we're within (2 * maxblocksize) bytes of the target
3356          * cache size, increment the target cache size
3357          */
3358         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3359                 atomic_add_64(&arc_c, (int64_t)bytes);
3360                 if (arc_c > arc_c_max)
3361                         arc_c = arc_c_max;
3362                 else if (state == arc_anon)
3363                         atomic_add_64(&arc_p, (int64_t)bytes);
3364                 if (arc_p > arc_c)
3365                         arc_p = arc_c;
3366         }
3367         ASSERT((int64_t)arc_p >= 0);
3368 }
3369
3370 /*
3371  * Check if arc_size has grown past our upper threshold, determined by
3372  * zfs_arc_overflow_shift.
3373  */
3374 static boolean_t
3375 arc_is_overflowing(void)
3376 {
3377         /* Always allow at least one block of overflow */
3378         uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
3379             arc_c >> zfs_arc_overflow_shift);
3380
3381         return (arc_size >= arc_c + overflow);
3382 }
3383
3384 /*
3385  * The buffer, supplied as the first argument, needs a data block. If we
3386  * are hitting the hard limit for the cache size, we must sleep, waiting
3387  * for the eviction thread to catch up. If we're past the target size
3388  * but below the hard limit, we'll only signal the reclaim thread and
3389  * continue on.
3390  */
3391 static void
3392 arc_get_data_buf(arc_buf_t *buf)
3393 {
3394         arc_state_t             *state = buf->b_hdr->b_l1hdr.b_state;
3395         uint64_t                size = buf->b_hdr->b_size;
3396         arc_buf_contents_t      type = arc_buf_type(buf->b_hdr);
3397
3398         arc_adapt(size, state);
3399
3400         /*
3401          * If arc_size is currently overflowing, and has grown past our
3402          * upper limit, we must be adding data faster than the evict
3403          * thread can evict. Thus, to ensure we don't compound the
3404          * problem by adding more data and forcing arc_size to grow even
3405          * further past it's target size, we halt and wait for the
3406          * eviction thread to catch up.
3407          *
3408          * It's also possible that the reclaim thread is unable to evict
3409          * enough buffers to get arc_size below the overflow limit (e.g.
3410          * due to buffers being un-evictable, or hash lock collisions).
3411          * In this case, we want to proceed regardless if we're
3412          * overflowing; thus we don't use a while loop here.
3413          */
3414         if (arc_is_overflowing()) {
3415                 mutex_enter(&arc_reclaim_lock);
3416
3417                 /*
3418                  * Now that we've acquired the lock, we may no longer be
3419                  * over the overflow limit, lets check.
3420                  *
3421                  * We're ignoring the case of spurious wake ups. If that
3422                  * were to happen, it'd let this thread consume an ARC
3423                  * buffer before it should have (i.e. before we're under
3424                  * the overflow limit and were signalled by the reclaim
3425                  * thread). As long as that is a rare occurrence, it
3426                  * shouldn't cause any harm.
3427                  */
3428                 if (arc_is_overflowing()) {
3429                         cv_signal(&arc_reclaim_thread_cv);
3430                         cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
3431                 }
3432
3433                 mutex_exit(&arc_reclaim_lock);
3434         }
3435
3436         if (type == ARC_BUFC_METADATA) {
3437                 buf->b_data = zio_buf_alloc(size);
3438                 arc_space_consume(size, ARC_SPACE_META);
3439         } else {
3440                 ASSERT(type == ARC_BUFC_DATA);
3441                 buf->b_data = zio_data_buf_alloc(size);
3442                 arc_space_consume(size, ARC_SPACE_DATA);
3443         }
3444
3445         /*
3446          * Update the state size.  Note that ghost states have a
3447          * "ghost size" and so don't need to be updated.
3448          */
3449         if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3450                 arc_buf_hdr_t *hdr = buf->b_hdr;
3451
3452                 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
3453
3454                 /*
3455                  * If this is reached via arc_read, the link is
3456                  * protected by the hash lock. If reached via
3457                  * arc_buf_alloc, the header should not be accessed by
3458                  * any other thread. And, if reached via arc_read_done,
3459                  * the hash lock will protect it if it's found in the
3460                  * hash table; otherwise no other thread should be
3461                  * trying to [add|remove]_reference it.
3462                  */
3463                 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
3464                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3465                         atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3466                             size);
3467                 }
3468                 /*
3469                  * If we are growing the cache, and we are adding anonymous
3470                  * data, and we have outgrown arc_p, update arc_p
3471                  */
3472                 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
3473                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
3474                         arc_p = MIN(arc_c, arc_p + size);
3475         }
3476 }
3477
3478 /*
3479  * This routine is called whenever a buffer is accessed.
3480  * NOTE: the caller must hold the hash lock; it is still held on return.
3481  */
3482 static void
3483 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3484 {
3485         clock_t now;
3486
3487         ASSERT(MUTEX_HELD(hash_lock));
3488         ASSERT(HDR_HAS_L1HDR(hdr));
3489
3490         if (hdr->b_l1hdr.b_state == arc_anon) {
3491                 /*
3492                  * This buffer is not in the cache, and does not
3493                  * appear in our "ghost" list.  Add the new buffer
3494                  * to the MRU state.
3495                  */
3496
3497                 ASSERT0(hdr->b_l1hdr.b_arc_access);
3498                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3499                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3500                 arc_change_state(arc_mru, hdr, hash_lock);
3501
3502         } else if (hdr->b_l1hdr.b_state == arc_mru) {
3503                 now = ddi_get_lbolt();
3504
3505                 /*
3506                  * If this buffer is here because of a prefetch, then either:
3507                  * - clear the flag if this is a "referencing" read
3508                  *   (any subsequent access will bump this into the MFU state).
3509                  * or
3510                  * - move the buffer to the head of the list if this is
3511                  *   another prefetch (to make it less likely to be evicted).
3512                  */
3513                 if (HDR_PREFETCH(hdr)) {
3514                         if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3515                                 /* link protected by hash lock */
3516                                 ASSERT(multilist_link_active(
3517                                     &hdr->b_l1hdr.b_arc_node));
3518                         } else {
3519                                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3520                                 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
3521                                 ARCSTAT_BUMP(arcstat_mru_hits);
3522                         }
3523                         hdr->b_l1hdr.b_arc_access = now;
3524                         return;
3525                 }
3526
3527                 /*
3528                  * This buffer has been "accessed" only once so far,
3529                  * but it is still in the cache. Move it to the MFU
3530                  * state.
3531                  */
3532                 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
3533                     ARC_MINTIME)) {
3534                         /*
3535                          * More than 125ms have passed since we
3536                          * instantiated this buffer.  Move it to the
3537                          * most frequently used state.
3538                          */
3539                         hdr->b_l1hdr.b_arc_access = now;
3540                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3541                         arc_change_state(arc_mfu, hdr, hash_lock);
3542                 }
3543                 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
3544                 ARCSTAT_BUMP(arcstat_mru_hits);
3545         } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
3546                 arc_state_t     *new_state;
3547                 /*
3548                  * This buffer has been "accessed" recently, but
3549                  * was evicted from the cache.  Move it to the
3550                  * MFU state.
3551                  */
3552
3553                 if (HDR_PREFETCH(hdr)) {
3554                         new_state = arc_mru;
3555                         if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
3556                                 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3557                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3558                 } else {
3559                         new_state = arc_mfu;
3560                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3561                 }
3562
3563                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3564                 arc_change_state(new_state, hdr, hash_lock);
3565
3566                 atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
3567                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3568         } else if (hdr->b_l1hdr.b_state == arc_mfu) {
3569                 /*
3570                  * This buffer has been accessed more than once and is
3571                  * still in the cache.  Keep it in the MFU state.
3572                  *
3573                  * NOTE: an add_reference() that occurred when we did
3574                  * the arc_read() will have kicked this off the list.
3575                  * If it was a prefetch, we will explicitly move it to
3576                  * the head of the list now.
3577                  */
3578                 if ((HDR_PREFETCH(hdr)) != 0) {
3579                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3580                         /* link protected by hash_lock */
3581                         ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3582                 }
3583                 atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
3584                 ARCSTAT_BUMP(arcstat_mfu_hits);
3585                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3586         } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
3587                 arc_state_t     *new_state = arc_mfu;
3588                 /*
3589                  * This buffer has been accessed more than once but has
3590                  * been evicted from the cache.  Move it back to the
3591                  * MFU state.
3592                  */
3593
3594                 if (HDR_PREFETCH(hdr)) {
3595                         /*
3596                          * This is a prefetch access...
3597                          * move this block back to the MRU state.
3598                          */
3599                         ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
3600                         new_state = arc_mru;
3601                 }
3602
3603                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3604                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3605                 arc_change_state(new_state, hdr, hash_lock);
3606
3607                 atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
3608                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3609         } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
3610                 /*
3611                  * This buffer is on the 2nd Level ARC.
3612                  */
3613
3614                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3615                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3616                 arc_change_state(arc_mfu, hdr, hash_lock);
3617         } else {
3618                 cmn_err(CE_PANIC, "invalid arc state 0x%p",
3619                     hdr->b_l1hdr.b_state);
3620         }
3621 }
3622
3623 /* a generic arc_done_func_t which you can use */
3624 /* ARGSUSED */
3625 void
3626 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3627 {
3628         if (zio == NULL || zio->io_error == 0)
3629                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3630         VERIFY(arc_buf_remove_ref(buf, arg));
3631 }
3632
3633 /* a generic arc_done_func_t */
3634 void
3635 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3636 {
3637         arc_buf_t **bufp = arg;
3638         if (zio && zio->io_error) {
3639                 VERIFY(arc_buf_remove_ref(buf, arg));
3640                 *bufp = NULL;
3641         } else {
3642                 *bufp = buf;
3643                 ASSERT(buf->b_data);
3644         }
3645 }
3646
3647 static void
3648 arc_read_done(zio_t *zio)
3649 {
3650         arc_buf_hdr_t   *hdr;
3651         arc_buf_t       *buf;
3652         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
3653         kmutex_t        *hash_lock = NULL;
3654         arc_callback_t  *callback_list, *acb;
3655         int             freeable = FALSE;
3656
3657         buf = zio->io_private;
3658         hdr = buf->b_hdr;
3659
3660         /*
3661          * The hdr was inserted into hash-table and removed from lists
3662          * prior to starting I/O.  We should find this header, since
3663          * it's in the hash table, and it should be legit since it's
3664          * not possible to evict it during the I/O.  The only possible
3665          * reason for it not to be found is if we were freed during the
3666          * read.
3667          */
3668         if (HDR_IN_HASH_TABLE(hdr)) {
3669                 arc_buf_hdr_t *found;
3670
3671                 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3672                 ASSERT3U(hdr->b_dva.dva_word[0], ==,
3673                     BP_IDENTITY(zio->io_bp)->dva_word[0]);
3674                 ASSERT3U(hdr->b_dva.dva_word[1], ==,
3675                     BP_IDENTITY(zio->io_bp)->dva_word[1]);
3676
3677                 found = buf_hash_find(hdr->b_spa, zio->io_bp,
3678                     &hash_lock);
3679
3680                 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3681                     hash_lock == NULL) ||
3682                     (found == hdr &&
3683                     DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3684                     (found == hdr && HDR_L2_READING(hdr)));
3685         }
3686
3687         hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
3688         if (l2arc_noprefetch && HDR_PREFETCH(hdr))
3689                 hdr->b_flags &= ~ARC_FLAG_L2CACHE;
3690
3691         /* byteswap if necessary */
3692         callback_list = hdr->b_l1hdr.b_acb;
3693         ASSERT(callback_list != NULL);
3694         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3695                 dmu_object_byteswap_t bswap =
3696                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3697                 if (BP_GET_LEVEL(zio->io_bp) > 0)
3698                     byteswap_uint64_array(buf->b_data, hdr->b_size);
3699                 else
3700                     dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
3701         }
3702
3703         arc_cksum_compute(buf, B_FALSE);
3704         arc_buf_watch(buf);
3705
3706         if (hash_lock && zio->io_error == 0 &&
3707             hdr->b_l1hdr.b_state == arc_anon) {
3708                 /*
3709                  * Only call arc_access on anonymous buffers.  This is because
3710                  * if we've issued an I/O for an evicted buffer, we've already
3711                  * called arc_access (to prevent any simultaneous readers from
3712                  * getting confused).
3713                  */
3714                 arc_access(hdr, hash_lock);
3715         }
3716
3717         /* create copies of the data buffer for the callers */
3718         abuf = buf;
3719         for (acb = callback_list; acb; acb = acb->acb_next) {
3720                 if (acb->acb_done) {
3721                         if (abuf == NULL) {
3722                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
3723                                 abuf = arc_buf_clone(buf);
3724                         }
3725                         acb->acb_buf = abuf;
3726                         abuf = NULL;
3727                 }
3728         }
3729         hdr->b_l1hdr.b_acb = NULL;
3730         hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3731         ASSERT(!HDR_BUF_AVAILABLE(hdr));
3732         if (abuf == buf) {
3733                 ASSERT(buf->b_efunc == NULL);
3734                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
3735                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3736         }
3737
3738         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
3739             callback_list != NULL);
3740
3741         if (zio->io_error != 0) {
3742                 hdr->b_flags |= ARC_FLAG_IO_ERROR;
3743                 if (hdr->b_l1hdr.b_state != arc_anon)
3744                         arc_change_state(arc_anon, hdr, hash_lock);
3745                 if (HDR_IN_HASH_TABLE(hdr))
3746                         buf_hash_remove(hdr);
3747                 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3748         }
3749
3750         /*
3751          * Broadcast before we drop the hash_lock to avoid the possibility
3752          * that the hdr (and hence the cv) might be freed before we get to
3753          * the cv_broadcast().
3754          */
3755         cv_broadcast(&hdr->b_l1hdr.b_cv);
3756
3757         if (hash_lock != NULL) {
3758                 mutex_exit(hash_lock);
3759         } else {
3760                 /*
3761                  * This block was freed while we waited for the read to
3762                  * complete.  It has been removed from the hash table and
3763                  * moved to the anonymous state (so that it won't show up
3764                  * in the cache).
3765                  */
3766                 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3767                 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3768         }
3769
3770         /* execute each callback and free its structure */
3771         while ((acb = callback_list) != NULL) {
3772                 if (acb->acb_done)
3773                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3774
3775                 if (acb->acb_zio_dummy != NULL) {
3776                         acb->acb_zio_dummy->io_error = zio->io_error;
3777                         zio_nowait(acb->acb_zio_dummy);
3778                 }
3779
3780                 callback_list = acb->acb_next;
3781                 kmem_free(acb, sizeof (arc_callback_t));
3782         }
3783
3784         if (freeable)
3785                 arc_hdr_destroy(hdr);
3786 }
3787
3788 /*
3789  * "Read" the block at the specified DVA (in bp) via the
3790  * cache.  If the block is found in the cache, invoke the provided
3791  * callback immediately and return.  Note that the `zio' parameter
3792  * in the callback will be NULL in this case, since no IO was
3793  * required.  If the block is not in the cache pass the read request
3794  * on to the spa with a substitute callback function, so that the
3795  * requested block will be added to the cache.
3796  *
3797  * If a read request arrives for a block that has a read in-progress,
3798  * either wait for the in-progress read to complete (and return the
3799  * results); or, if this is a read with a "done" func, add a record
3800  * to the read to invoke the "done" func when the read completes,
3801  * and return; or just return.
3802  *
3803  * arc_read_done() will invoke all the requested "done" functions
3804  * for readers of this block.
3805  */
3806 int
3807 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3808     void *private, zio_priority_t priority, int zio_flags,
3809     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3810 {
3811         arc_buf_hdr_t *hdr = NULL;
3812         arc_buf_t *buf = NULL;
3813         kmutex_t *hash_lock = NULL;
3814         zio_t *rzio;
3815         uint64_t guid = spa_load_guid(spa);
3816         int rc = 0;
3817
3818         ASSERT(!BP_IS_EMBEDDED(bp) ||
3819             BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3820
3821 top:
3822         if (!BP_IS_EMBEDDED(bp)) {
3823                 /*
3824                  * Embedded BP's have no DVA and require no I/O to "read".
3825                  * Create an anonymous arc buf to back it.
3826                  */
3827                 hdr = buf_hash_find(guid, bp, &hash_lock);
3828         }
3829
3830         if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
3831
3832                 *arc_flags |= ARC_FLAG_CACHED;
3833
3834                 if (HDR_IO_IN_PROGRESS(hdr)) {
3835
3836                         if (*arc_flags & ARC_FLAG_WAIT) {
3837                                 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
3838                                 mutex_exit(hash_lock);
3839                                 goto top;
3840                         }
3841                         ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3842
3843                         if (done) {
3844                                 arc_callback_t  *acb = NULL;
3845
3846                                 acb = kmem_zalloc(sizeof (arc_callback_t),
3847                                     KM_SLEEP);
3848                                 acb->acb_done = done;
3849                                 acb->acb_private = private;
3850                                 if (pio != NULL)
3851                                         acb->acb_zio_dummy = zio_null(pio,
3852                                             spa, NULL, NULL, NULL, zio_flags);
3853
3854                                 ASSERT(acb->acb_done != NULL);
3855                                 acb->acb_next = hdr->b_l1hdr.b_acb;
3856                                 hdr->b_l1hdr.b_acb = acb;
3857                                 add_reference(hdr, hash_lock, private);
3858                                 mutex_exit(hash_lock);
3859                                 goto out;
3860                         }
3861                         mutex_exit(hash_lock);
3862                         goto out;
3863                 }
3864
3865                 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
3866                     hdr->b_l1hdr.b_state == arc_mfu);
3867
3868                 if (done) {
3869                         add_reference(hdr, hash_lock, private);
3870                         /*
3871                          * If this block is already in use, create a new
3872                          * copy of the data so that we will be guaranteed
3873                          * that arc_release() will always succeed.
3874                          */
3875                         buf = hdr->b_l1hdr.b_buf;
3876                         ASSERT(buf);
3877                         ASSERT(buf->b_data);
3878                         if (HDR_BUF_AVAILABLE(hdr)) {
3879                                 ASSERT(buf->b_efunc == NULL);
3880                                 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3881                         } else {
3882                                 buf = arc_buf_clone(buf);
3883                         }
3884
3885                 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
3886                     refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3887                         hdr->b_flags |= ARC_FLAG_PREFETCH;
3888                 }
3889                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3890                 arc_access(hdr, hash_lock);
3891                 if (*arc_flags & ARC_FLAG_L2CACHE)
3892                         hdr->b_flags |= ARC_FLAG_L2CACHE;
3893                 if (*arc_flags & ARC_FLAG_L2COMPRESS)
3894                         hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3895                 mutex_exit(hash_lock);
3896                 ARCSTAT_BUMP(arcstat_hits);
3897                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
3898                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
3899                     data, metadata, hits);
3900
3901                 if (done)
3902                         done(NULL, buf, private);
3903         } else {
3904                 uint64_t size = BP_GET_LSIZE(bp);
3905                 arc_callback_t *acb;
3906                 vdev_t *vd = NULL;
3907                 uint64_t addr = 0;
3908                 boolean_t devw = B_FALSE;
3909                 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3910                 int32_t b_asize = 0;
3911
3912                 /*
3913                  * Gracefully handle a damaged logical block size as a
3914                  * checksum error by passing a dummy zio to the done callback.
3915                  */
3916                 if (size > spa_maxblocksize(spa)) {
3917                         if (done) {
3918                                 rzio = zio_null(pio, spa, NULL,
3919                                     NULL, NULL, zio_flags);
3920                                 rzio->io_error = ECKSUM;
3921                                 done(rzio, buf, private);
3922                                 zio_nowait(rzio);
3923                         }
3924                         rc = ECKSUM;
3925                         goto out;
3926                 }
3927
3928                 if (hdr == NULL) {
3929                         /* this block is not in the cache */
3930                         arc_buf_hdr_t *exists = NULL;
3931                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3932                         buf = arc_buf_alloc(spa, size, private, type);
3933                         hdr = buf->b_hdr;
3934                         if (!BP_IS_EMBEDDED(bp)) {
3935                                 hdr->b_dva = *BP_IDENTITY(bp);
3936                                 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3937                                 exists = buf_hash_insert(hdr, &hash_lock);
3938                         }
3939                         if (exists != NULL) {
3940                                 /* somebody beat us to the hash insert */
3941                                 mutex_exit(hash_lock);
3942                                 buf_discard_identity(hdr);
3943                                 (void) arc_buf_remove_ref(buf, private);
3944                                 goto top; /* restart the IO request */
3945                         }
3946
3947                         /* if this is a prefetch, we don't have a reference */
3948                         if (*arc_flags & ARC_FLAG_PREFETCH) {
3949                                 (void) remove_reference(hdr, hash_lock,
3950                                     private);
3951                                 hdr->b_flags |= ARC_FLAG_PREFETCH;
3952                         }
3953                         if (*arc_flags & ARC_FLAG_L2CACHE)
3954                                 hdr->b_flags |= ARC_FLAG_L2CACHE;
3955                         if (*arc_flags & ARC_FLAG_L2COMPRESS)
3956                                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3957                         if (BP_GET_LEVEL(bp) > 0)
3958                                 hdr->b_flags |= ARC_FLAG_INDIRECT;
3959                 } else {
3960                         /*
3961                          * This block is in the ghost cache. If it was L2-only
3962                          * (and thus didn't have an L1 hdr), we realloc the
3963                          * header to add an L1 hdr.
3964                          */
3965                         if (!HDR_HAS_L1HDR(hdr)) {
3966                                 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
3967                                     hdr_full_cache);
3968                         }
3969
3970                         ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
3971                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3972                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3973                         ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3974
3975                         /* if this is a prefetch, we don't have a reference */
3976                         if (*arc_flags & ARC_FLAG_PREFETCH)
3977                                 hdr->b_flags |= ARC_FLAG_PREFETCH;
3978                         else
3979                                 add_reference(hdr, hash_lock, private);
3980                         if (*arc_flags & ARC_FLAG_L2CACHE)
3981                                 hdr->b_flags |= ARC_FLAG_L2CACHE;
3982                         if (*arc_flags & ARC_FLAG_L2COMPRESS)
3983                                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3984                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3985                         buf->b_hdr = hdr;
3986                         buf->b_data = NULL;
3987                         buf->b_efunc = NULL;
3988                         buf->b_private = NULL;
3989                         buf->b_next = NULL;
3990                         hdr->b_l1hdr.b_buf = buf;
3991                         ASSERT0(hdr->b_l1hdr.b_datacnt);
3992                         hdr->b_l1hdr.b_datacnt = 1;
3993                         arc_get_data_buf(buf);
3994                         arc_access(hdr, hash_lock);
3995                 }
3996
3997                 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
3998
3999                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4000                 acb->acb_done = done;
4001                 acb->acb_private = private;
4002
4003                 ASSERT(hdr->b_l1hdr.b_acb == NULL);
4004                 hdr->b_l1hdr.b_acb = acb;
4005                 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4006
4007                 if (HDR_HAS_L2HDR(hdr) &&
4008                     (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4009                         devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4010                         addr = hdr->b_l2hdr.b_daddr;
4011                         b_compress = HDR_GET_COMPRESS(hdr);
4012                         b_asize = hdr->b_l2hdr.b_asize;
4013                         /*
4014                          * Lock out device removal.
4015                          */
4016                         if (vdev_is_dead(vd) ||
4017                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4018                                 vd = NULL;
4019                 }
4020
4021                 if (hash_lock != NULL)
4022                         mutex_exit(hash_lock);
4023
4024                 /*
4025                  * At this point, we have a level 1 cache miss.  Try again in
4026                  * L2ARC if possible.
4027                  */
4028                 ASSERT3U(hdr->b_size, ==, size);
4029                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4030                     uint64_t, size, zbookmark_phys_t *, zb);
4031                 ARCSTAT_BUMP(arcstat_misses);
4032                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4033                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4034                     data, metadata, misses);
4035
4036                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4037                         /*
4038                          * Read from the L2ARC if the following are true:
4039                          * 1. The L2ARC vdev was previously cached.
4040                          * 2. This buffer still has L2ARC metadata.
4041                          * 3. This buffer isn't currently writing to the L2ARC.
4042                          * 4. The L2ARC entry wasn't evicted, which may
4043                          *    also have invalidated the vdev.
4044                          * 5. This isn't prefetch and l2arc_noprefetch is set.
4045                          */
4046                         if (HDR_HAS_L2HDR(hdr) &&
4047                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4048                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4049                                 l2arc_read_callback_t *cb;
4050
4051                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4052                                 ARCSTAT_BUMP(arcstat_l2_hits);
4053                                 atomic_inc_32(&hdr->b_l2hdr.b_hits);
4054
4055                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4056                                     KM_SLEEP);
4057                                 cb->l2rcb_buf = buf;
4058                                 cb->l2rcb_spa = spa;
4059                                 cb->l2rcb_bp = *bp;
4060                                 cb->l2rcb_zb = *zb;
4061                                 cb->l2rcb_flags = zio_flags;
4062                                 cb->l2rcb_compress = b_compress;
4063
4064                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4065                                     addr + size < vd->vdev_psize -
4066                                     VDEV_LABEL_END_SIZE);
4067
4068                                 /*
4069                                  * l2arc read.  The SCL_L2ARC lock will be
4070                                  * released by l2arc_read_done().
4071                                  * Issue a null zio if the underlying buffer
4072                                  * was squashed to zero size by compression.
4073                                  */
4074                                 if (b_compress == ZIO_COMPRESS_EMPTY) {
4075                                         rzio = zio_null(pio, spa, vd,
4076                                             l2arc_read_done, cb,
4077                                             zio_flags | ZIO_FLAG_DONT_CACHE |
4078                                             ZIO_FLAG_CANFAIL |
4079                                             ZIO_FLAG_DONT_PROPAGATE |
4080                                             ZIO_FLAG_DONT_RETRY);
4081                                 } else {
4082                                         rzio = zio_read_phys(pio, vd, addr,
4083                                             b_asize, buf->b_data,
4084                                             ZIO_CHECKSUM_OFF,
4085                                             l2arc_read_done, cb, priority,
4086                                             zio_flags | ZIO_FLAG_DONT_CACHE |
4087                                             ZIO_FLAG_CANFAIL |
4088                                             ZIO_FLAG_DONT_PROPAGATE |
4089                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
4090                                 }
4091                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4092                                     zio_t *, rzio);
4093                                 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4094
4095                                 if (*arc_flags & ARC_FLAG_NOWAIT) {
4096                                         zio_nowait(rzio);
4097                                         goto out;
4098                                 }
4099
4100                                 ASSERT(*arc_flags & ARC_FLAG_WAIT);
4101                                 if (zio_wait(rzio) == 0)
4102                                         goto out;
4103
4104                                 /* l2arc read error; goto zio_read() */
4105                         } else {
4106                                 DTRACE_PROBE1(l2arc__miss,
4107                                     arc_buf_hdr_t *, hdr);
4108                                 ARCSTAT_BUMP(arcstat_l2_misses);
4109                                 if (HDR_L2_WRITING(hdr))
4110                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
4111                                 spa_config_exit(spa, SCL_L2ARC, vd);
4112                         }
4113                 } else {
4114                         if (vd != NULL)
4115                                 spa_config_exit(spa, SCL_L2ARC, vd);
4116                         if (l2arc_ndev != 0) {
4117                                 DTRACE_PROBE1(l2arc__miss,
4118                                     arc_buf_hdr_t *, hdr);
4119                                 ARCSTAT_BUMP(arcstat_l2_misses);
4120                         }
4121                 }
4122
4123                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
4124                     arc_read_done, buf, priority, zio_flags, zb);
4125
4126                 if (*arc_flags & ARC_FLAG_WAIT) {
4127                         rc = zio_wait(rzio);
4128                         goto out;
4129                 }
4130
4131                 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4132                 zio_nowait(rzio);
4133         }
4134
4135 out:
4136         spa_read_history_add(spa, zb, *arc_flags);
4137         return (rc);
4138 }
4139
4140 arc_prune_t *
4141 arc_add_prune_callback(arc_prune_func_t *func, void *private)
4142 {
4143         arc_prune_t *p;
4144
4145         p = kmem_alloc(sizeof (*p), KM_SLEEP);
4146         p->p_pfunc = func;
4147         p->p_private = private;
4148         list_link_init(&p->p_node);
4149         refcount_create(&p->p_refcnt);
4150
4151         mutex_enter(&arc_prune_mtx);
4152         refcount_add(&p->p_refcnt, &arc_prune_list);
4153         list_insert_head(&arc_prune_list, p);
4154         mutex_exit(&arc_prune_mtx);
4155
4156         return (p);
4157 }
4158
4159 void
4160 arc_remove_prune_callback(arc_prune_t *p)
4161 {
4162         mutex_enter(&arc_prune_mtx);
4163         list_remove(&arc_prune_list, p);
4164         if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
4165                 refcount_destroy(&p->p_refcnt);
4166                 kmem_free(p, sizeof (*p));
4167         }
4168         mutex_exit(&arc_prune_mtx);
4169 }
4170
4171 void
4172 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4173 {
4174         ASSERT(buf->b_hdr != NULL);
4175         ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4176         ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4177             func == NULL);
4178         ASSERT(buf->b_efunc == NULL);
4179         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4180
4181         buf->b_efunc = func;
4182         buf->b_private = private;
4183 }
4184
4185 /*
4186  * Notify the arc that a block was freed, and thus will never be used again.
4187  */
4188 void
4189 arc_freed(spa_t *spa, const blkptr_t *bp)
4190 {
4191         arc_buf_hdr_t *hdr;
4192         kmutex_t *hash_lock;
4193         uint64_t guid = spa_load_guid(spa);
4194
4195         ASSERT(!BP_IS_EMBEDDED(bp));
4196
4197         hdr = buf_hash_find(guid, bp, &hash_lock);
4198         if (hdr == NULL)
4199                 return;
4200         if (HDR_BUF_AVAILABLE(hdr)) {
4201                 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4202                 add_reference(hdr, hash_lock, FTAG);
4203                 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4204                 mutex_exit(hash_lock);
4205
4206                 arc_release(buf, FTAG);
4207                 (void) arc_buf_remove_ref(buf, FTAG);
4208         } else {
4209                 mutex_exit(hash_lock);
4210         }
4211
4212 }
4213
4214 /*
4215  * Clear the user eviction callback set by arc_set_callback(), first calling
4216  * it if it exists.  Because the presence of a callback keeps an arc_buf cached
4217  * clearing the callback may result in the arc_buf being destroyed.  However,
4218  * it will not result in the *last* arc_buf being destroyed, hence the data
4219  * will remain cached in the ARC. We make a copy of the arc buffer here so
4220  * that we can process the callback without holding any locks.
4221  *
4222  * It's possible that the callback is already in the process of being cleared
4223  * by another thread.  In this case we can not clear the callback.
4224  *
4225  * Returns B_TRUE if the callback was successfully called and cleared.
4226  */
4227 boolean_t
4228 arc_clear_callback(arc_buf_t *buf)
4229 {
4230         arc_buf_hdr_t *hdr;
4231         kmutex_t *hash_lock;
4232         arc_evict_func_t *efunc = buf->b_efunc;
4233         void *private = buf->b_private;
4234
4235         mutex_enter(&buf->b_evict_lock);
4236         hdr = buf->b_hdr;
4237         if (hdr == NULL) {
4238                 /*
4239                  * We are in arc_do_user_evicts().
4240                  */
4241                 ASSERT(buf->b_data == NULL);
4242                 mutex_exit(&buf->b_evict_lock);
4243                 return (B_FALSE);
4244         } else if (buf->b_data == NULL) {
4245                 /*
4246                  * We are on the eviction list; process this buffer now
4247                  * but let arc_do_user_evicts() do the reaping.
4248                  */
4249                 buf->b_efunc = NULL;
4250                 mutex_exit(&buf->b_evict_lock);
4251                 VERIFY0(efunc(private));
4252                 return (B_TRUE);
4253         }
4254         hash_lock = HDR_LOCK(hdr);
4255         mutex_enter(hash_lock);
4256         hdr = buf->b_hdr;
4257         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4258
4259         ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4260             hdr->b_l1hdr.b_datacnt);
4261         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4262             hdr->b_l1hdr.b_state == arc_mfu);
4263
4264         buf->b_efunc = NULL;
4265         buf->b_private = NULL;
4266
4267         if (hdr->b_l1hdr.b_datacnt > 1) {
4268                 mutex_exit(&buf->b_evict_lock);
4269                 arc_buf_destroy(buf, TRUE);
4270         } else {
4271                 ASSERT(buf == hdr->b_l1hdr.b_buf);
4272                 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4273                 mutex_exit(&buf->b_evict_lock);
4274         }
4275
4276         mutex_exit(hash_lock);
4277         VERIFY0(efunc(private));
4278         return (B_TRUE);
4279 }
4280
4281 /*
4282  * Release this buffer from the cache, making it an anonymous buffer.  This
4283  * must be done after a read and prior to modifying the buffer contents.
4284  * If the buffer has more than one reference, we must make
4285  * a new hdr for the buffer.
4286  */
4287 void
4288 arc_release(arc_buf_t *buf, void *tag)
4289 {
4290         kmutex_t *hash_lock;
4291         arc_state_t *state;
4292         arc_buf_hdr_t *hdr = buf->b_hdr;
4293
4294         /*
4295          * It would be nice to assert that if its DMU metadata (level >
4296          * 0 || it's the dnode file), then it must be syncing context.
4297          * But we don't know that information at this level.
4298          */
4299
4300         mutex_enter(&buf->b_evict_lock);
4301
4302         ASSERT(HDR_HAS_L1HDR(hdr));
4303
4304         /*
4305          * We don't grab the hash lock prior to this check, because if
4306          * the buffer's header is in the arc_anon state, it won't be
4307          * linked into the hash table.
4308          */
4309         if (hdr->b_l1hdr.b_state == arc_anon) {
4310                 mutex_exit(&buf->b_evict_lock);
4311                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4312                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
4313                 ASSERT(!HDR_HAS_L2HDR(hdr));
4314                 ASSERT(BUF_EMPTY(hdr));
4315
4316                 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4317                 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4318                 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4319
4320                 ASSERT3P(buf->b_efunc, ==, NULL);
4321                 ASSERT3P(buf->b_private, ==, NULL);
4322
4323                 hdr->b_l1hdr.b_arc_access = 0;
4324                 arc_buf_thaw(buf);
4325
4326                 return;
4327         }
4328
4329         hash_lock = HDR_LOCK(hdr);
4330         mutex_enter(hash_lock);
4331
4332         /*
4333          * This assignment is only valid as long as the hash_lock is
4334          * held, we must be careful not to reference state or the
4335          * b_state field after dropping the lock.
4336          */
4337         state = hdr->b_l1hdr.b_state;
4338         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4339         ASSERT3P(state, !=, arc_anon);
4340
4341         /* this buffer is not on any list */
4342         ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4343
4344         if (HDR_HAS_L2HDR(hdr)) {
4345                 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
4346                 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
4347
4348                 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4349                 list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr);
4350
4351                 /*
4352                  * We don't want to leak the b_tmp_cdata buffer that was
4353                  * allocated in l2arc_write_buffers()
4354                  */
4355                 arc_buf_l2_cdata_free(hdr);
4356
4357                 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4358
4359                 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
4360         }
4361
4362         /*
4363          * Do we have more than one buf?
4364          */
4365         if (hdr->b_l1hdr.b_datacnt > 1) {
4366                 arc_buf_hdr_t *nhdr;
4367                 arc_buf_t **bufp;
4368                 uint64_t blksz = hdr->b_size;
4369                 uint64_t spa = hdr->b_spa;
4370                 arc_buf_contents_t type = arc_buf_type(hdr);
4371                 uint32_t flags = hdr->b_flags;
4372
4373                 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4374                 /*
4375                  * Pull the data off of this hdr and attach it to
4376                  * a new anonymous hdr.
4377                  */
4378                 (void) remove_reference(hdr, hash_lock, tag);
4379                 bufp = &hdr->b_l1hdr.b_buf;
4380                 while (*bufp != buf)
4381                         bufp = &(*bufp)->b_next;
4382                 *bufp = buf->b_next;
4383                 buf->b_next = NULL;
4384
4385                 ASSERT3P(state, !=, arc_l2c_only);
4386                 ASSERT3U(state->arcs_size, >=, hdr->b_size);
4387                 atomic_add_64(&state->arcs_size, -hdr->b_size);
4388                 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4389                         uint64_t *size;
4390
4391                         ASSERT3P(state, !=, arc_l2c_only);
4392                         size = &state->arcs_lsize[type];
4393                         ASSERT3U(*size, >=, hdr->b_size);
4394                         atomic_add_64(size, -hdr->b_size);
4395                 }
4396
4397                 /*
4398                  * We're releasing a duplicate user data buffer, update
4399                  * our statistics accordingly.
4400                  */
4401                 if (HDR_ISTYPE_DATA(hdr)) {
4402                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4403                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4404                             -hdr->b_size);
4405                 }
4406                 hdr->b_l1hdr.b_datacnt -= 1;
4407                 arc_cksum_verify(buf);
4408                 arc_buf_unwatch(buf);
4409
4410                 mutex_exit(hash_lock);
4411
4412                 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4413                 nhdr->b_size = blksz;
4414                 nhdr->b_spa = spa;
4415
4416                 nhdr->b_l1hdr.b_mru_hits = 0;
4417                 nhdr->b_l1hdr.b_mru_ghost_hits = 0;
4418                 nhdr->b_l1hdr.b_mfu_hits = 0;
4419                 nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
4420                 nhdr->b_l1hdr.b_l2_hits = 0;
4421                 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4422                 nhdr->b_flags |= arc_bufc_to_flags(type);
4423                 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4424
4425                 nhdr->b_l1hdr.b_buf = buf;
4426                 nhdr->b_l1hdr.b_datacnt = 1;
4427                 nhdr->b_l1hdr.b_state = arc_anon;
4428                 nhdr->b_l1hdr.b_arc_access = 0;
4429                 nhdr->b_l1hdr.b_tmp_cdata = NULL;
4430                 nhdr->b_freeze_cksum = NULL;
4431
4432                 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4433                 buf->b_hdr = nhdr;
4434                 mutex_exit(&buf->b_evict_lock);
4435                 atomic_add_64(&arc_anon->arcs_size, blksz);
4436         } else {
4437                 mutex_exit(&buf->b_evict_lock);
4438                 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4439                 /* protected by hash lock, or hdr is on arc_anon */
4440                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4441                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4442                 hdr->b_l1hdr.b_mru_hits = 0;
4443                 hdr->b_l1hdr.b_mru_ghost_hits = 0;
4444                 hdr->b_l1hdr.b_mfu_hits = 0;
4445                 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
4446                 hdr->b_l1hdr.b_l2_hits = 0;
4447                 arc_change_state(arc_anon, hdr, hash_lock);
4448                 hdr->b_l1hdr.b_arc_access = 0;
4449                 mutex_exit(hash_lock);
4450
4451                 buf_discard_identity(hdr);
4452                 arc_buf_thaw(buf);
4453         }
4454         buf->b_efunc = NULL;
4455         buf->b_private = NULL;
4456 }
4457
4458 int
4459 arc_released(arc_buf_t *buf)
4460 {
4461         int released;
4462
4463         mutex_enter(&buf->b_evict_lock);
4464         released = (buf->b_data != NULL &&
4465             buf->b_hdr->b_l1hdr.b_state == arc_anon);
4466         mutex_exit(&buf->b_evict_lock);
4467         return (released);
4468 }
4469
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the reference count of buf's hdr (converted
 * to int), sampled while holding the buf's b_evict_lock.
 */
int
arc_referenced(arc_buf_t *buf)
{
        int holds;

        mutex_enter(&buf->b_evict_lock);
        holds = refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt);
        mutex_exit(&buf->b_evict_lock);

        return (holds);
}
#endif
4482
4483 static void
4484 arc_write_ready(zio_t *zio)
4485 {
4486         arc_write_callback_t *callback = zio->io_private;
4487         arc_buf_t *buf = callback->awcb_buf;
4488         arc_buf_hdr_t *hdr = buf->b_hdr;
4489
4490         ASSERT(HDR_HAS_L1HDR(hdr));
4491         ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4492         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4493         callback->awcb_ready(zio, buf, callback->awcb_private);
4494
4495         /*
4496          * If the IO is already in progress, then this is a re-write
4497          * attempt, so we need to thaw and re-compute the cksum.
4498          * It is the responsibility of the callback to handle the
4499          * accounting for any re-write attempt.
4500          */
4501         if (HDR_IO_IN_PROGRESS(hdr)) {
4502                 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
4503                 if (hdr->b_freeze_cksum != NULL) {
4504                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4505                         hdr->b_freeze_cksum = NULL;
4506                 }
4507                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
4508         }
4509         arc_cksum_compute(buf, B_FALSE);
4510         hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4511 }
4512
4513 /*
4514  * The SPA calls this callback for each physical write that happens on behalf
4515  * of a logical write.  See the comment in dbuf_write_physdone() for details.
4516  */
4517 static void
4518 arc_write_physdone(zio_t *zio)
4519 {
4520         arc_write_callback_t *cb = zio->io_private;
4521         if (cb->awcb_physdone != NULL)
4522                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4523 }
4524
4525 static void
4526 arc_write_done(zio_t *zio)
4527 {
4528         arc_write_callback_t *callback = zio->io_private;
4529         arc_buf_t *buf = callback->awcb_buf;
4530         arc_buf_hdr_t *hdr = buf->b_hdr;
4531
4532         ASSERT(hdr->b_l1hdr.b_acb == NULL);
4533
4534         if (zio->io_error == 0) {
4535                 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
4536                         buf_discard_identity(hdr);
4537                 } else {
4538                         hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4539                         hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
4540                 }
4541         } else {
4542                 ASSERT(BUF_EMPTY(hdr));
4543         }
4544
4545         /*
4546          * If the block to be written was all-zero or compressed enough to be
4547          * embedded in the BP, no write was performed so there will be no
4548          * dva/birth/checksum.  The buffer must therefore remain anonymous
4549          * (and uncached).
4550          */
4551         if (!BUF_EMPTY(hdr)) {
4552                 arc_buf_hdr_t *exists;
4553                 kmutex_t *hash_lock;
4554
4555                 ASSERT(zio->io_error == 0);
4556
4557                 arc_cksum_verify(buf);
4558
4559                 exists = buf_hash_insert(hdr, &hash_lock);
4560                 if (exists != NULL) {
4561                         /*
4562                          * This can only happen if we overwrite for
4563                          * sync-to-convergence, because we remove
4564                          * buffers from the hash table when we arc_free().
4565                          */
4566                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
4567                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4568                                         panic("bad overwrite, hdr=%p exists=%p",
4569                                             (void *)hdr, (void *)exists);
4570                                 ASSERT(refcount_is_zero(
4571                                     &exists->b_l1hdr.b_refcnt));
4572                                 arc_change_state(arc_anon, exists, hash_lock);
4573                                 mutex_exit(hash_lock);
4574                                 arc_hdr_destroy(exists);
4575                                 exists = buf_hash_insert(hdr, &hash_lock);
4576                                 ASSERT3P(exists, ==, NULL);
4577                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
4578                                 /* nopwrite */
4579                                 ASSERT(zio->io_prop.zp_nopwrite);
4580                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4581                                         panic("bad nopwrite, hdr=%p exists=%p",
4582                                             (void *)hdr, (void *)exists);
4583                         } else {
4584                                 /* Dedup */
4585                                 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4586                                 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
4587                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
4588                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
4589                         }
4590                 }
4591                 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4592                 /* if it's not anon, we are doing a scrub */
4593                 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
4594                         arc_access(hdr, hash_lock);
4595                 mutex_exit(hash_lock);
4596         } else {
4597                 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4598         }
4599
4600         ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4601         callback->awcb_done(zio, buf, callback->awcb_private);
4602
4603         kmem_free(callback, sizeof (arc_write_callback_t));
4604 }
4605
4606 zio_t *
4607 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
4608     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
4609     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
4610     arc_done_func_t *done, void *private, zio_priority_t priority,
4611     int zio_flags, const zbookmark_phys_t *zb)
4612 {
4613         arc_buf_hdr_t *hdr = buf->b_hdr;
4614         arc_write_callback_t *callback;
4615         zio_t *zio;
4616
4617         ASSERT(ready != NULL);
4618         ASSERT(done != NULL);
4619         ASSERT(!HDR_IO_ERROR(hdr));
4620         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4621         ASSERT(hdr->b_l1hdr.b_acb == NULL);
4622         ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4623         if (l2arc)
4624                 hdr->b_flags |= ARC_FLAG_L2CACHE;
4625         if (l2arc_compress)
4626                 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4627         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4628         callback->awcb_ready = ready;
4629         callback->awcb_physdone = physdone;
4630         callback->awcb_done = done;
4631         callback->awcb_private = private;
4632         callback->awcb_buf = buf;
4633
4634         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4635             arc_write_ready, arc_write_physdone, arc_write_done, callback,
4636             priority, zio_flags, zb);
4637
4638         return (zio);
4639 }
4640
/*
 * Decide whether a write should be throttled because free memory is low.
 *
 * In kernel builds, unless zfs_arc_memory_throttle_disable is set, the
 * request is refused with EAGAIN when freemem is at or below
 * arc_lotsfree_percent percent of physmem; the throttle statistics are
 * bumped in that case.  Outside the kernel this always returns 0.
 * Neither argument is consulted by the current implementation.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
        if (!zfs_arc_memory_throttle_disable &&
            freemem <= physmem * arc_lotsfree_percent / 100) {
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
                return (SET_ERROR(EAGAIN));
        }
#endif
        return (0);
}
4656
4657 void
4658 arc_tempreserve_clear(uint64_t reserve)
4659 {
4660         atomic_add_64(&arc_tempreserve, -reserve);
4661         ASSERT((int64_t)arc_tempreserve >= 0);
4662 }
4663
4664 int
4665 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4666 {
4667         int error;
4668         uint64_t anon_size;
4669
4670         if (reserve > arc_c/4 && !arc_no_grow)
4671                 arc_c = MIN(arc_c_max, reserve * 4);
4672
4673         /*
4674          * Throttle when the calculated memory footprint for the TXG
4675          * exceeds the target ARC size.
4676          */
4677         if (reserve > arc_c) {
4678                 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
4679                 return (SET_ERROR(ERESTART));
4680         }
4681
4682         /*
4683          * Don't count loaned bufs as in flight dirty data to prevent long
4684          * network delays from blocking transactions that are ready to be
4685          * assigned to a txg.
4686          */
4687         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4688
4689         /*
4690          * Writes will, almost always, require additional memory allocations
4691          * in order to compress/encrypt/etc the data.  We therefore need to
4692          * make sure that there is sufficient available memory for this.
4693          */
4694         error = arc_memory_throttle(reserve, txg);
4695         if (error != 0)
4696                 return (error);
4697
4698         /*
4699          * Throttle writes when the amount of dirty data in the cache
4700          * gets too large.  We try to keep the cache less than half full
4701          * of dirty blocks so that our sync times don't grow too large.
4702          * Note: if two requests come in concurrently, we might let them
4703          * both succeed, when one of them should fail.  Not a huge deal.
4704          */
4705
4706         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4707             anon_size > arc_c / 4) {
4708                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4709                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4710                     arc_tempreserve>>10,
4711                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4712                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4713                     reserve>>10, arc_c>>10);
4714                 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
4715                 return (SET_ERROR(ERESTART));
4716         }
4717         atomic_add_64(&arc_tempreserve, reserve);
4718         return (0);
4719 }
4720
4721 static void
4722 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
4723     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
4724 {
4725         size->value.ui64 = state->arcs_size;
4726         evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
4727         evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
4728 }
4729
4730 static int
4731 arc_kstat_update(kstat_t *ksp, int rw)
4732 {
4733         arc_stats_t *as = ksp->ks_data;
4734
4735         if (rw == KSTAT_WRITE) {
4736                 return (SET_ERROR(EACCES));
4737         } else {
4738                 arc_kstat_update_state(arc_anon,
4739                     &as->arcstat_anon_size,
4740                     &as->arcstat_anon_evict_data,
4741                     &as->arcstat_anon_evict_metadata);
4742                 arc_kstat_update_state(arc_mru,
4743                     &as->arcstat_mru_size,
4744                     &as->arcstat_mru_evict_data,
4745                     &as->arcstat_mru_evict_metadata);
4746                 arc_kstat_update_state(arc_mru_ghost,
4747                     &as->arcstat_mru_ghost_size,
4748                     &as->arcstat_mru_ghost_evict_data,
4749                     &as->arcstat_mru_ghost_evict_metadata);
4750                 arc_kstat_update_state(arc_mfu,
4751                     &as->arcstat_mfu_size,
4752                     &as->arcstat_mfu_evict_data,
4753                     &as->arcstat_mfu_evict_metadata);
4754                 arc_kstat_update_state(arc_mfu_ghost,
4755                     &as->arcstat_mfu_ghost_size,
4756                     &as->arcstat_mfu_ghost_evict_data,
4757                     &as->arcstat_mfu_ghost_evict_metadata);
4758         }
4759
4760         return (0);
4761 }
4762
4763 /*
4764  * This function *must* return indices evenly distributed between all
4765  * sublists of the multilist. This is needed due to how the ARC eviction
4766  * code is laid out; arc_evict_state() assumes ARC buffers are evenly
4767  * distributed between all sublists and uses this assumption when
4768  * deciding which sublist to evict from and how much to evict from it.
4769  */
4770 unsigned int
4771 arc_state_multilist_index_func(multilist_t *ml, void *obj)
4772 {
4773         arc_buf_hdr_t *hdr = obj;
4774
4775         /*
4776          * We rely on b_dva to generate evenly distributed index
4777          * numbers using buf_hash below. So, as an added precaution,
4778          * let's make sure we never add empty buffers to the arc lists.
4779          */
4780         ASSERT(!BUF_EMPTY(hdr));
4781
4782         /*
4783          * The assumption here, is the hash value for a given
4784          * arc_buf_hdr_t will remain constant throughout its lifetime
4785          * (i.e. its b_spa, b_dva, and b_birth fields don't change).
4786          * Thus, we don't need to store the header's sublist index
4787          * on insertion, as this index can be recalculated on removal.
4788          *
4789          * Also, the low order bits of the hash value are thought to be
4790          * distributed evenly. Otherwise, in the case that the multilist
4791          * has a power of two number of sublists, each sublists' usage
4792          * would not be evenly distributed.
4793          */
4794         return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
4795             multilist_get_num_sublists(ml));
4796 }
4797
4798 void
4799 arc_init(void)
4800 {
4801         mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
4802         cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
4803         cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
4804
4805         mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
4806         cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
4807
4808         /* Convert seconds to clock ticks */
4809         zfs_arc_min_prefetch_lifespan = 1 * hz;
4810
4811         /* Start out with 1/8 of all memory */
4812         arc_c = physmem * PAGESIZE / 8;
4813
4814 #ifdef _KERNEL
4815         /*
4816          * On architectures where the physical memory can be larger
4817          * than the addressable space (intel in 32-bit mode), we may
4818          * need to limit the cache to 1/8 of VM size.
4819          */
4820         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4821         /*
4822          * Register a shrinker to support synchronous (direct) memory
4823          * reclaim from the arc.  This is done to prevent kswapd from
4824          * swapping out pages when it is preferable to shrink the arc.
4825          */
4826         spl_register_shrinker(&arc_shrinker);
4827 #endif
4828
4829         /* set min cache to zero */
4830         arc_c_min = 4<<20;
4831         /* set max to 1/2 of all memory */
4832         arc_c_max = arc_c * 4;
4833
4834         /*
4835          * Allow the tunables to override our calculations if they are
4836          * reasonable (ie. over 64MB)
4837          */
4838         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
4839                 arc_c_max = zfs_arc_max;
4840         if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max)
4841                 arc_c_min = zfs_arc_min;
4842
4843         arc_c = arc_c_max;
4844         arc_p = (arc_c >> 1);
4845
4846         /* limit meta-data to 3/4 of the arc capacity */
4847         arc_meta_limit = (3 * arc_c_max) / 4;
4848         arc_meta_max = 0;
4849
4850         /* Allow the tunable to override if it is reasonable */
4851         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4852                 arc_meta_limit = zfs_arc_meta_limit;
4853
4854         if (zfs_arc_num_sublists_per_state < 1)
4855                 zfs_arc_num_sublists_per_state = num_online_cpus();
4856
4857         /* if kmem_flags are set, lets try to use less memory */
4858         if (kmem_debugging())
4859                 arc_c = arc_c / 2;
4860         if (arc_c < arc_c_min)
4861                 arc_c = arc_c_min;
4862
4863         arc_anon = &ARC_anon;
4864         arc_mru = &ARC_mru;
4865         arc_mru_ghost = &ARC_mru_ghost;
4866         arc_mfu = &ARC_mfu;
4867         arc_mfu_ghost = &ARC_mfu_ghost;
4868         arc_l2c_only = &ARC_l2c_only;
4869         arc_size = 0;
4870
4871         multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
4872             sizeof (arc_buf_hdr_t),
4873             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4874             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4875         multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
4876             sizeof (arc_buf_hdr_t),
4877             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4878             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4879         multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
4880             sizeof (arc_buf_hdr_t),
4881             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4882             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4883         multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
4884             sizeof (arc_buf_hdr_t),
4885             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4886             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4887         multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
4888             sizeof (arc_buf_hdr_t),
4889             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4890             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4891         multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
4892             sizeof (arc_buf_hdr_t),
4893             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4894             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4895         multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
4896             sizeof (arc_buf_hdr_t),
4897             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4898             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4899         multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
4900             sizeof (arc_buf_hdr_t),
4901             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4902             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4903         multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
4904             sizeof (arc_buf_hdr_t),
4905             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4906             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4907         multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
4908             sizeof (arc_buf_hdr_t),
4909             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
4910             zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
4911
4912         arc_anon->arcs_state = ARC_STATE_ANON;
4913         arc_mru->arcs_state = ARC_STATE_MRU;
4914         arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
4915         arc_mfu->arcs_state = ARC_STATE_MFU;
4916         arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
4917         arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
4918
4919         buf_init();
4920
4921         arc_reclaim_thread_exit = FALSE;
4922         arc_user_evicts_thread_exit = FALSE;
4923         list_create(&arc_prune_list, sizeof (arc_prune_t),
4924             offsetof(arc_prune_t, p_node));
4925         arc_eviction_list = NULL;
4926         mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
4927         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4928
4929         arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
4930             max_ncpus, INT_MAX, TASKQ_PREPOPULATE);
4931
4932         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4933             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4934
4935         if (arc_ksp != NULL) {
4936                 arc_ksp->ks_data = &arc_stats;
4937                 arc_ksp->ks_update = arc_kstat_update;
4938                 kstat_install(arc_ksp);
4939         }
4940
4941         (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
4942             TS_RUN, minclsyspri);
4943
4944         (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
4945             TS_RUN, minclsyspri);
4946
4947         arc_dead = FALSE;
4948         arc_warm = B_FALSE;
4949
4950         /*
4951          * Calculate maximum amount of dirty data per pool.
4952          *
4953          * If it has been set by a module parameter, take that.
4954          * Otherwise, use a percentage of physical memory defined by
4955          * zfs_dirty_data_max_percent (default 10%) with a cap at
4956          * zfs_dirty_data_max_max (default 25% of physical memory).
4957          */
4958         if (zfs_dirty_data_max_max == 0)
4959                 zfs_dirty_data_max_max = physmem * PAGESIZE *
4960                     zfs_dirty_data_max_max_percent / 100;
4961
4962         if (zfs_dirty_data_max == 0) {
4963                 zfs_dirty_data_max = physmem * PAGESIZE *
4964                     zfs_dirty_data_max_percent / 100;
4965                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4966                     zfs_dirty_data_max_max);
4967         }
4968 }
4969
4970 void
4971 arc_fini(void)
4972 {
4973         arc_prune_t *p;
4974
4975 #ifdef _KERNEL
4976         spl_unregister_shrinker(&arc_shrinker);
4977 #endif /* _KERNEL */
4978
4979         mutex_enter(&arc_reclaim_lock);
4980         arc_reclaim_thread_exit = TRUE;
4981         /*
4982          * The reclaim thread will set arc_reclaim_thread_exit back to
4983          * FALSE when it is finished exiting; we're waiting for that.
4984          */
4985         while (arc_reclaim_thread_exit) {
4986                 cv_signal(&arc_reclaim_thread_cv);
4987                 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
4988         }
4989         mutex_exit(&arc_reclaim_lock);
4990
4991         mutex_enter(&arc_user_evicts_lock);
4992         arc_user_evicts_thread_exit = TRUE;
4993         /*
4994          * The user evicts thread will set arc_user_evicts_thread_exit
4995          * to FALSE when it is finished exiting; we're waiting for that.
4996          */
4997         while (arc_user_evicts_thread_exit) {
4998                 cv_signal(&arc_user_evicts_cv);
4999                 cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5000         }
5001         mutex_exit(&arc_user_evicts_lock);
5002
5003         /* Use TRUE to ensure *all* buffers are evicted */
5004         arc_flush(NULL, TRUE);
5005
5006         arc_dead = TRUE;
5007
5008         if (arc_ksp != NULL) {
5009                 kstat_delete(arc_ksp);
5010                 arc_ksp = NULL;
5011         }
5012
5013         taskq_wait(arc_prune_taskq);
5014         taskq_destroy(arc_prune_taskq);
5015
5016         mutex_enter(&arc_prune_mtx);
5017         while ((p = list_head(&arc_prune_list)) != NULL) {
5018                 list_remove(&arc_prune_list, p);
5019                 refcount_remove(&p->p_refcnt, &arc_prune_list);
5020                 refcount_destroy(&p->p_refcnt);
5021                 kmem_free(p, sizeof (*p));
5022         }
5023         mutex_exit(&arc_prune_mtx);
5024
5025         list_destroy(&arc_prune_list);
5026         mutex_destroy(&arc_prune_mtx);
5027         mutex_destroy(&arc_reclaim_lock);
5028         cv_destroy(&arc_reclaim_thread_cv);
5029         cv_destroy(&arc_reclaim_waiters_cv);
5030
5031         mutex_destroy(&arc_user_evicts_lock);
5032         cv_destroy(&arc_user_evicts_cv);
5033
5034         multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
5035         multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
5036         multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
5037         multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
5038         multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
5039         multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
5040         multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
5041         multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
5042         multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
5043         multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
5044
5045         buf_fini();
5046
5047         ASSERT0(arc_loaned_bytes);
5048 }
5049
5050 /*
5051  * Level 2 ARC
5052  *
5053  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5054  * It uses dedicated storage devices to hold cached data, which are populated
5055  * using large infrequent writes.  The main role of this cache is to boost
5056  * the performance of random read workloads.  The intended L2ARC devices
5057  * include short-stroked disks, solid state disks, and other media with
5058  * substantially faster read latency than disk.
5059  *
5060  *                 +-----------------------+
5061  *                 |         ARC           |
5062  *                 +-----------------------+
5063  *                    |         ^     ^
5064  *                    |         |     |
5065  *      l2arc_feed_thread()    arc_read()
5066  *                    |         |     |
5067  *                    |  l2arc read   |
5068  *                    V         |     |
5069  *               +---------------+    |
5070  *               |     L2ARC     |    |
5071  *               +---------------+    |
5072  *                   |    ^           |
5073  *          l2arc_write() |           |
5074  *                   |    |           |
5075  *                   V    |           |
5076  *                 +-------+      +-------+
5077  *                 | vdev  |      | vdev  |
5078  *                 | cache |      | cache |
5079  *                 +-------+      +-------+
5080  *                 +=========+     .-----.
5081  *                 :  L2ARC  :    |-_____-|
5082  *                 : devices :    | Disks |
5083  *                 +=========+    `-_____-'
5084  *
5085  * Read requests are satisfied from the following sources, in order:
5086  *
5087  *      1) ARC
5088  *      2) vdev cache of L2ARC devices
5089  *      3) L2ARC devices
5090  *      4) vdev cache of disks
5091  *      5) disks
5092  *
5093  * Some L2ARC device types exhibit extremely slow write performance.
5094  * To accommodate for this there are some significant differences between
5095  * the L2ARC and traditional cache design:
5096  *
5097  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
5098  * the ARC behave as usual, freeing buffers and placing headers on ghost
5099  * lists.  The ARC does not send buffers to the L2ARC during eviction as
5100  * this would add inflated write latencies for all ARC memory pressure.
5101  *
5102  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5103  * It does this by periodically scanning buffers from the eviction-end of
5104  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5105  * not already there. It scans until a headroom of buffers is satisfied,
5106  * which itself is a buffer for ARC eviction. If a compressible buffer is
5107  * found during scanning and selected for writing to an L2ARC device, we
5108  * temporarily boost scanning headroom during the next scan cycle to make
5109  * sure we adapt to compression effects (which might significantly reduce
5110  * the data volume we write to L2ARC). The thread that does this is
5111  * l2arc_feed_thread(), illustrated below; example sizes are included to
5112  * provide a better sense of ratio than this diagram:
5113  *
5114  *             head -->                        tail
5115  *              +---------------------+----------+
5116  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
5117  *              +---------------------+----------+   |   o L2ARC eligible
5118  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
5119  *              +---------------------+----------+   |
5120  *                   15.9 Gbytes      ^ 32 Mbytes    |
5121  *                                 headroom          |
5122  *                                            l2arc_feed_thread()
5123  *                                                   |
5124  *                       l2arc write hand <--[oooo]--'
5125  *                               |           8 Mbyte
5126  *                               |          write max
5127  *                               V
5128  *                +==============================+
5129  *      L2ARC dev |####|#|###|###|    |####| ... |
5130  *                +==============================+
5131  *                           32 Gbytes
5132  *
5133  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5134  * evicted, then the L2ARC has cached a buffer much sooner than it probably
5135  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
5136  * safe to say that this is an uncommon case, since buffers at the end of
5137  * the ARC lists have moved there due to inactivity.
5138  *
5139  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5140  * then the L2ARC simply misses copying some buffers.  This serves as a
5141  * pressure valve to prevent heavy read workloads from both stalling the ARC
5142  * with waits and clogging the L2ARC with writes.  This also helps prevent
5143  * the potential for the L2ARC to churn if it attempts to cache content too
5144  * quickly, such as during backups of the entire pool.
5145  *
5146  * 5. After system boot and before the ARC has filled main memory, there are
5147  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5148  * lists can remain mostly static.  Instead of searching from tail of these
5149  * lists as pictured, the l2arc_feed_thread() will search from the list heads
5150  * for eligible buffers, greatly increasing its chance of finding them.
5151  *
5152  * The L2ARC device write speed is also boosted during this time so that
5153  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
5154  * there are no L2ARC reads, and no fear of degrading read performance
5155  * through increased writes.
5156  *
5157  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5158  * the vdev queue can aggregate them into larger and fewer writes.  Each
5159  * device is written to in a rotor fashion, sweeping writes through
5160  * available space then repeating.
5161  *
5162  * 7. The L2ARC does not store dirty content.  It never needs to flush
5163  * write buffers back to disk based storage.
5164  *
5165  * 8. If an ARC buffer is written (and dirtied) which also exists in the
5166  * L2ARC, the now stale L2ARC buffer is immediately dropped.
5167  *
5168  * The performance of the L2ARC can be tweaked by a number of tunables, which
5169  * may be necessary for different workloads:
5170  *
5171  *      l2arc_write_max         max write bytes per interval
5172  *      l2arc_write_boost       extra write bytes during device warmup
5173  *      l2arc_noprefetch        skip caching prefetched buffers
5174  *      l2arc_nocompress        skip compressing buffers
5175  *      l2arc_headroom          number of max device writes to precache
5176  *      l2arc_headroom_boost    when we find compressed buffers during ARC
5177  *                              scanning, we multiply headroom by this
5178  *                              percentage factor for the next scan cycle,
5179  *                              since more compressed buffers are likely to
5180  *                              be present
5181  *      l2arc_feed_secs         seconds between L2ARC writing
5182  *
5183  * Tunables may be removed or added as future performance improvements are
5184  * integrated, and also may become zpool properties.
5185  *
5186  * There are three key functions that control how the L2ARC warms up:
5187  *
5188  *      l2arc_write_eligible()  check if a buffer is eligible to cache
5189  *      l2arc_write_size()      calculate how much to write
5190  *      l2arc_write_interval()  calculate sleep delay between writes
5191  *
5192  * These three functions determine what to write, how much, and how quickly
5193  * to send writes.
5194  */
5195
5196 static boolean_t
5197 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
5198 {
5199         /*
5200          * A buffer is *not* eligible for the L2ARC if it:
5201          * 1. belongs to a different spa.
5202          * 2. is already cached on the L2ARC.
5203          * 3. has an I/O in progress (it may be an incomplete read).
5204          * 4. is flagged not eligible (zfs property).
5205          */
5206         if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
5207             HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
5208                 return (B_FALSE);
5209
5210         return (B_TRUE);
5211 }
5212
5213 static uint64_t
5214 l2arc_write_size(void)
5215 {
5216         uint64_t size;
5217
5218         /*
5219          * Make sure our globals have meaningful values in case the user
5220          * altered them.
5221          */
5222         size = l2arc_write_max;
5223         if (size == 0) {
5224                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5225                     "be greater than zero, resetting it to the default (%d)",
5226                     L2ARC_WRITE_SIZE);
5227                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
5228         }
5229
5230         if (arc_warm == B_FALSE)
5231                 size += l2arc_write_boost;
5232
5233         return (size);
5234
5235 }
5236
5237 static clock_t
5238 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5239 {
5240         clock_t interval, next, now;
5241
5242         /*
5243          * If the ARC lists are busy, increase our write rate; if the
5244          * lists are stale, idle back.  This is achieved by checking
5245          * how much we previously wrote - if it was more than half of
5246          * what we wanted, schedule the next write much sooner.
5247          */
5248         if (l2arc_feed_again && wrote > (wanted / 2))
5249                 interval = (hz * l2arc_feed_min_ms) / 1000;
5250         else
5251                 interval = hz * l2arc_feed_secs;
5252
5253         now = ddi_get_lbolt();
5254         next = MAX(now, MIN(now + interval, began + interval));
5255
5256         return (next);
5257 }
5258
5259 /*
5260  * Cycle through L2ARC devices.  This is how L2ARC load balances.
5261  * If a device is returned, this also returns holding the spa config lock.
5262  */
5263 static l2arc_dev_t *
5264 l2arc_dev_get_next(void)
5265 {
5266         l2arc_dev_t *first, *next = NULL;
5267
5268         /*
5269          * Lock out the removal of spas (spa_namespace_lock), then removal
5270          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5271          * both locks will be dropped and a spa config lock held instead.
5272          */
5273         mutex_enter(&spa_namespace_lock);
5274         mutex_enter(&l2arc_dev_mtx);
5275
5276         /* if there are no vdevs, there is nothing to do */
5277         if (l2arc_ndev == 0)
5278                 goto out;
5279
5280         first = NULL;
5281         next = l2arc_dev_last;
5282         do {
5283                 /* loop around the list looking for a non-faulted vdev */
5284                 if (next == NULL) {
5285                         next = list_head(l2arc_dev_list);
5286                 } else {
5287                         next = list_next(l2arc_dev_list, next);
5288                         if (next == NULL)
5289                                 next = list_head(l2arc_dev_list);
5290                 }
5291
5292                 /* if we have come back to the start, bail out */
5293                 if (first == NULL)
5294                         first = next;
5295                 else if (next == first)
5296                         break;
5297
5298         } while (vdev_is_dead(next->l2ad_vdev));
5299
5300         /* if we were unable to find any usable vdevs, return NULL */
5301         if (vdev_is_dead(next->l2ad_vdev))
5302                 next = NULL;
5303
5304         l2arc_dev_last = next;
5305
5306 out:
5307         mutex_exit(&l2arc_dev_mtx);
5308
5309         /*
5310          * Grab the config lock to prevent the 'next' device from being
5311          * removed while we are writing to it.
5312          */
5313         if (next != NULL)
5314                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5315         mutex_exit(&spa_namespace_lock);
5316
5317         return (next);
5318 }
5319
5320 /*
5321  * Free buffers that were tagged for destruction.
5322  */
5323 static void
5324 l2arc_do_free_on_write(void)
5325 {
5326         list_t *buflist;
5327         l2arc_data_free_t *df, *df_prev;
5328
5329         mutex_enter(&l2arc_free_on_write_mtx);
5330         buflist = l2arc_free_on_write;
5331
5332         for (df = list_tail(buflist); df; df = df_prev) {
5333                 df_prev = list_prev(buflist, df);
5334                 ASSERT(df->l2df_data != NULL);
5335                 ASSERT(df->l2df_func != NULL);
5336                 df->l2df_func(df->l2df_data, df->l2df_size);
5337                 list_remove(buflist, df);
5338                 kmem_free(df, sizeof (l2arc_data_free_t));
5339         }
5340
5341         mutex_exit(&l2arc_free_on_write_mtx);
5342 }
5343
5344 /*
5345  * A write to a cache device has completed.  Update all headers to allow
5346  * reads from these buffers to begin.
5347  */
5348 static void
5349 l2arc_write_done(zio_t *zio)
5350 {
5351         l2arc_write_callback_t *cb;
5352         l2arc_dev_t *dev;
5353         list_t *buflist;
5354         arc_buf_hdr_t *head, *hdr, *hdr_prev;
5355         kmutex_t *hash_lock;
5356         int64_t bytes_dropped = 0;
5357
5358         cb = zio->io_private;
5359         ASSERT(cb != NULL);
5360         dev = cb->l2wcb_dev;
5361         ASSERT(dev != NULL);
5362         head = cb->l2wcb_head;
5363         ASSERT(head != NULL);
5364         buflist = &dev->l2ad_buflist;
5365         ASSERT(buflist != NULL);
5366         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5367             l2arc_write_callback_t *, cb);
5368
5369         if (zio->io_error != 0)
5370                 ARCSTAT_BUMP(arcstat_l2_writes_error);
5371
5372         /*
5373          * All writes completed, or an error was hit.
5374          */
5375 top:
5376         mutex_enter(&dev->l2ad_mtx);
5377         for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5378                 hdr_prev = list_prev(buflist, hdr);
5379
5380                 hash_lock = HDR_LOCK(hdr);
5381
5382                 /*
5383                  * We cannot use mutex_enter or else we can deadlock
5384                  * with l2arc_write_buffers (due to swapping the order
5385                  * the hash lock and l2ad_mtx are taken).
5386                  */
5387                 if (!mutex_tryenter(hash_lock)) {
5388                         /*
5389                          * Missed the hash lock. We must retry so we
5390                          * don't leave the ARC_FLAG_L2_WRITING bit set.
5391                          */
5392                         ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
5393
5394                         /*
5395                          * We don't want to rescan the headers we've
5396                          * already marked as having been written out, so
5397                          * we reinsert the head node so we can pick up
5398                          * where we left off.
5399                          */
5400                         list_remove(buflist, head);
5401                         list_insert_after(buflist, hdr, head);
5402
5403                         mutex_exit(&dev->l2ad_mtx);
5404
5405                         /*
5406                          * We wait for the hash lock to become available
5407                          * to try and prevent busy waiting, and increase
5408                          * the chance we'll be able to acquire the lock
5409                          * the next time around.
5410                          */
5411                         mutex_enter(hash_lock);
5412                         mutex_exit(hash_lock);
5413                         goto top;
5414                 }
5415
5416                 /*
5417                  * We could not have been moved into the arc_l2c_only
5418                  * state while in-flight due to our ARC_FLAG_L2_WRITING
5419                  * bit being set. Let's just ensure that's being enforced.
5420                  */
5421                 ASSERT(HDR_HAS_L1HDR(hdr));
5422
5423                 /*
5424                  * We may have allocated a buffer for L2ARC compression,
5425                  * we must release it to avoid leaking this data.
5426                  */
5427                 l2arc_release_cdata_buf(hdr);
5428
5429                 if (zio->io_error != 0) {
5430                         /*
5431                          * Error - drop L2ARC entry.
5432                          */
5433                         list_remove(buflist, hdr);
5434                         hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5435
5436                         ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5437                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5438                 }
5439
5440                 /*
5441                  * Allow ARC to begin reads and ghost list evictions to
5442                  * this L2ARC entry.
5443                  */
5444                 hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5445
5446                 mutex_exit(hash_lock);
5447         }
5448
5449         atomic_inc_64(&l2arc_writes_done);
5450         list_remove(buflist, head);
5451         ASSERT(!HDR_HAS_L1HDR(head));
5452         kmem_cache_free(hdr_l2only_cache, head);
5453         mutex_exit(&dev->l2ad_mtx);
5454
5455         vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5456
5457         l2arc_do_free_on_write();
5458
5459         kmem_free(cb, sizeof (l2arc_write_callback_t));
5460 }
5461
5462 /*
5463  * A read to a cache device completed.  Validate buffer contents before
5464  * handing over to the regular ARC routines.
5465  */
5466 static void
5467 l2arc_read_done(zio_t *zio)
5468 {
5469         l2arc_read_callback_t *cb;
5470         arc_buf_hdr_t *hdr;
5471         arc_buf_t *buf;
5472         kmutex_t *hash_lock;
5473         int equal;
5474
5475         ASSERT(zio->io_vd != NULL);
5476         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5477
5478         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5479
5480         cb = zio->io_private;
5481         ASSERT(cb != NULL);
5482         buf = cb->l2rcb_buf;
5483         ASSERT(buf != NULL);
5484
5485         hash_lock = HDR_LOCK(buf->b_hdr);
5486         mutex_enter(hash_lock);
5487         hdr = buf->b_hdr;
5488         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5489
5490         /*
5491          * If the buffer was compressed, decompress it first.
5492          */
5493         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5494                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5495         ASSERT(zio->io_data != NULL);
5496
5497         /*
5498          * Check this survived the L2ARC journey.
5499          */
5500         equal = arc_cksum_equal(buf);
5501         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5502                 mutex_exit(hash_lock);
5503                 zio->io_private = buf;
5504                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
5505                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
5506                 arc_read_done(zio);
5507         } else {
5508                 mutex_exit(hash_lock);
5509                 /*
5510                  * Buffer didn't survive caching.  Increment stats and
5511                  * reissue to the original storage device.
5512                  */
5513                 if (zio->io_error != 0) {
5514                         ARCSTAT_BUMP(arcstat_l2_io_error);
5515                 } else {
5516                         zio->io_error = SET_ERROR(EIO);
5517                 }
5518                 if (!equal)
5519                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5520
5521                 /*
5522                  * If there's no waiter, issue an async i/o to the primary
5523                  * storage now.  If there *is* a waiter, the caller must
5524                  * issue the i/o in a context where it's OK to block.
5525                  */
5526                 if (zio->io_waiter == NULL) {
5527                         zio_t *pio = zio_unique_parent(zio);
5528
5529                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5530
5531                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5532                             buf->b_data, zio->io_size, arc_read_done, buf,
5533                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5534                 }
5535         }
5536
5537         kmem_free(cb, sizeof (l2arc_read_callback_t));
5538 }
5539
5540 /*
5541  * This is the list priority from which the L2ARC will search for pages to
5542  * cache.  This is used within loops (0..3) to cycle through lists in the
5543  * desired order.  This order can have a significant effect on cache
5544  * performance.
5545  *
5546  * Currently the metadata lists are hit first, MFU then MRU, followed by
5547  * the data lists.  This function returns a locked list, and also returns
5548  * the lock pointer.
5549  */
5550 static multilist_sublist_t *
5551 l2arc_sublist_lock(int list_num)
5552 {
5553         multilist_t *ml = NULL;
5554         unsigned int idx;
5555
5556         ASSERT(list_num >= 0 && list_num <= 3);
5557
5558         switch (list_num) {
5559         case 0:
5560                 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
5561                 break;
5562         case 1:
5563                 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
5564                 break;
5565         case 2:
5566                 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
5567                 break;
5568         case 3:
5569                 ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
5570                 break;
5571         }
5572
5573         /*
5574          * Return a randomly-selected sublist. This is acceptable
5575          * because the caller feeds only a little bit of data for each
5576          * call (8MB). Subsequent calls will result in different
5577          * sublists being selected.
5578          */
5579         idx = multilist_get_random_index(ml);
5580         return (multilist_sublist_lock(ml, idx));
5581 }
5582
5583 /*
5584  * Evict buffers from the device write hand to the distance specified in
5585  * bytes.  This distance may span populated buffers, it may span nothing.
5586  * This is clearing a region on the L2ARC device ready for writing.
5587  * If the 'all' boolean is set, every buffer is evicted.
5588  */
5589 static void
5590 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
5591 {
5592         list_t *buflist;
5593         arc_buf_hdr_t *hdr, *hdr_prev;
5594         kmutex_t *hash_lock;
5595         uint64_t taddr;
5596         int64_t bytes_evicted = 0;
5597
5598         buflist = &dev->l2ad_buflist;
5599
5600         if (!all && dev->l2ad_first) {
5601                 /*
5602                  * This is the first sweep through the device.  There is
5603                  * nothing to evict.
5604                  */
5605                 return;
5606         }
5607
5608         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
5609                 /*
5610                  * When nearing the end of the device, evict to the end
5611                  * before the device write hand jumps to the start.
5612                  */
5613                 taddr = dev->l2ad_end;
5614         } else {
5615                 taddr = dev->l2ad_hand + distance;
5616         }
5617         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
5618             uint64_t, taddr, boolean_t, all);
5619
5620 top:
5621         mutex_enter(&dev->l2ad_mtx);
5622         for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
5623                 hdr_prev = list_prev(buflist, hdr);
5624
5625                 hash_lock = HDR_LOCK(hdr);
5626
5627                 /*
5628                  * We cannot use mutex_enter or else we can deadlock
5629                  * with l2arc_write_buffers (due to swapping the order
5630                  * the hash lock and l2ad_mtx are taken).
5631                  */
5632                 if (!mutex_tryenter(hash_lock)) {
5633                         /*
5634                          * Missed the hash lock.  Retry.
5635                          */
5636                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
5637                         mutex_exit(&dev->l2ad_mtx);
5638                         mutex_enter(hash_lock);
5639                         mutex_exit(hash_lock);
5640                         goto top;
5641                 }
5642
5643                 if (HDR_L2_WRITE_HEAD(hdr)) {
5644                         /*
5645                          * We hit a write head node.  Leave it for
5646                          * l2arc_write_done().
5647                          */
5648                         list_remove(buflist, hdr);
5649                         mutex_exit(hash_lock);
5650                         continue;
5651                 }
5652
5653                 if (!all && HDR_HAS_L2HDR(hdr) &&
5654                     (hdr->b_l2hdr.b_daddr > taddr ||
5655                     hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
5656                         /*
5657                          * We've evicted to the target address,
5658                          * or the end of the device.
5659                          */
5660                         mutex_exit(hash_lock);
5661                         break;
5662                 }
5663
5664                 ASSERT(HDR_HAS_L2HDR(hdr));
5665                 if (!HDR_HAS_L1HDR(hdr)) {
5666                         ASSERT(!HDR_L2_READING(hdr));
5667                         /*
5668                          * This doesn't exist in the ARC.  Destroy.
5669                          * arc_hdr_destroy() will call list_remove()
5670                          * and decrement arcstat_l2_size.
5671                          */
5672                         arc_change_state(arc_anon, hdr, hash_lock);
5673                         arc_hdr_destroy(hdr);
5674                 } else {
5675                         ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
5676                         ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
5677                         /*
5678                          * Invalidate issued or about to be issued
5679                          * reads, since we may be about to write
5680                          * over this location.
5681                          */
5682                         if (HDR_L2_READING(hdr)) {
5683                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
5684                                 hdr->b_flags |= ARC_FLAG_L2_EVICTED;
5685                         }
5686
5687                         /*
5688                          * Tell ARC this no longer exists in L2ARC.
5689                          */
5690                         /* Tell ARC this no longer exists in L2ARC. */
5691                         ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5692                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5693                         hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5694                         list_remove(buflist, hdr);
5695
5696                         /* Ensure this header has finished being written */
5697                         ASSERT(!HDR_L2_WRITING(hdr));
5698                         ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
5699                 }
5700                 mutex_exit(hash_lock);
5701         }
5702         mutex_exit(&dev->l2ad_mtx);
5703
5704         vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
5705         dev->l2ad_evict = taddr;
5706 }
5707
5708 /*
5709  * Find and write ARC buffers to the L2ARC device.
5710  *
5711  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5712  * for reading until they have completed writing.
5713  * The headroom_boost is an in-out parameter used to maintain headroom boost
5714  * state between calls to this function.
5715  *
5716  * Returns the number of bytes actually written (which may be smaller than
5717  * the delta by which the device hand has changed due to alignment).
5718  */
5719 static uint64_t
5720 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5721     boolean_t *headroom_boost)
5722 {
5723         arc_buf_hdr_t *hdr, *hdr_prev, *head;
5724         uint64_t write_asize, write_psize, write_sz, headroom,
5725             buf_compress_minsz;
5726         void *buf_data;
5727         boolean_t full;
5728         l2arc_write_callback_t *cb;
5729         zio_t *pio, *wzio;
5730         uint64_t guid = spa_load_guid(spa);
5731         int try;
5732         const boolean_t do_headroom_boost = *headroom_boost;
5733
5734         ASSERT(dev->l2ad_vdev != NULL);
5735
5736         /* Lower the flag now, we might want to raise it again later. */
5737         *headroom_boost = B_FALSE;
5738
5739         pio = NULL;
5740         write_sz = write_asize = write_psize = 0;
5741         full = B_FALSE;
5742         head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5743         head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5744         head->b_flags |= ARC_FLAG_HAS_L2HDR;
5745
5746         /*
5747          * We will want to try to compress buffers that are at least 2x the
5748          * device sector size.
5749          */
5750         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5751
5752         /*
5753          * Copy buffers for L2ARC writing.
5754          */
5755         for (try = 0; try <= 3; try++) {
5756                 multilist_sublist_t *mls = l2arc_sublist_lock(try);
5757                 uint64_t passed_sz = 0;
5758
5759                 /*
5760                  * L2ARC fast warmup.
5761                  *
5762                  * Until the ARC is warm and starts to evict, read from the
5763                  * head of the ARC lists rather than the tail.
5764                  */
5765                 if (arc_warm == B_FALSE)
5766                         hdr = multilist_sublist_head(mls);
5767                 else
5768                         hdr = multilist_sublist_tail(mls);
5769
5770                 headroom = target_sz * l2arc_headroom;
5771                 if (do_headroom_boost)
5772                         headroom = (headroom * l2arc_headroom_boost) / 100;
5773
5774                 for (; hdr; hdr = hdr_prev) {
5775                         kmutex_t *hash_lock;
5776                         uint64_t buf_sz;
5777
5778                         if (arc_warm == B_FALSE)
5779                                 hdr_prev = multilist_sublist_next(mls, hdr);
5780                         else
5781                                 hdr_prev = multilist_sublist_prev(mls, hdr);
5782
5783                         hash_lock = HDR_LOCK(hdr);
5784                         if (!mutex_tryenter(hash_lock)) {
5785                                 /*
5786                                  * Skip this buffer rather than waiting.
5787                                  */
5788                                 continue;
5789                         }
5790
5791                         passed_sz += hdr->b_size;
5792                         if (passed_sz > headroom) {
5793                                 /*
5794                                  * Searched too far.
5795                                  */
5796                                 mutex_exit(hash_lock);
5797                                 break;
5798                         }
5799
5800                         if (!l2arc_write_eligible(guid, hdr)) {
5801                                 mutex_exit(hash_lock);
5802                                 continue;
5803                         }
5804
5805                         if ((write_sz + hdr->b_size) > target_sz) {
5806                                 full = B_TRUE;
5807                                 mutex_exit(hash_lock);
5808                                 break;
5809                         }
5810
5811                         if (pio == NULL) {
5812                                 /*
5813                                  * Insert a dummy header on the buflist so
5814                                  * l2arc_write_done() can find where the
5815                                  * write buffers begin without searching.
5816                                  */
5817                                 mutex_enter(&dev->l2ad_mtx);
5818                                 list_insert_head(&dev->l2ad_buflist, head);
5819                                 mutex_exit(&dev->l2ad_mtx);
5820
5821                                 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
5822                                     KM_SLEEP);
5823                                 cb->l2wcb_dev = dev;
5824                                 cb->l2wcb_head = head;
5825                                 pio = zio_root(spa, l2arc_write_done, cb,
5826                                     ZIO_FLAG_CANFAIL);
5827                         }
5828
5829                         /*
5830                          * Create and add a new L2ARC header.
5831                          */
5832                         hdr->b_l2hdr.b_dev = dev;
5833                         arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
5834                         hdr->b_flags |= ARC_FLAG_L2_WRITING;
5835                         /*
5836                          * Temporarily stash the data buffer in b_tmp_cdata.
5837                          * The subsequent write step will pick it up from
5838                          * there. This is because can't access b_l1hdr.b_buf
5839                          * without holding the hash_lock, which we in turn
5840                          * can't access without holding the ARC list locks
5841                          * (which we want to avoid during compression/writing)
5842                          */
5843                         HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
5844                         hdr->b_l2hdr.b_asize = hdr->b_size;
5845                         hdr->b_l2hdr.b_hits = 0;
5846                         hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5847
5848                         buf_sz = hdr->b_size;
5849                         hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5850
5851                         mutex_enter(&dev->l2ad_mtx);
5852                         list_insert_head(&dev->l2ad_buflist, hdr);
5853                         mutex_exit(&dev->l2ad_mtx);
5854
5855                         /*
5856                          * Compute and store the buffer cksum before
5857                          * writing.  On debug the cksum is verified first.
5858                          */
5859                         arc_cksum_verify(hdr->b_l1hdr.b_buf);
5860                         arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
5861
5862                         mutex_exit(hash_lock);
5863
5864                         write_sz += buf_sz;
5865                 }
5866
5867                 multilist_sublist_unlock(mls);
5868
5869                 if (full == B_TRUE)
5870                         break;
5871         }
5872
5873         /* No buffers selected for writing? */
5874         if (pio == NULL) {
5875                 ASSERT0(write_sz);
5876                 ASSERT(!HDR_HAS_L1HDR(head));
5877                 kmem_cache_free(hdr_l2only_cache, head);
5878                 return (0);
5879         }
5880
5881         mutex_enter(&dev->l2ad_mtx);
5882
5883         /*
5884          * Now start writing the buffers. We're starting at the write head
5885          * and work backwards, retracing the course of the buffer selector
5886          * loop above.
5887          */
5888         for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
5889             hdr = list_prev(&dev->l2ad_buflist, hdr)) {
5890                 uint64_t buf_sz;
5891
5892                 /*
5893                  * We rely on the L1 portion of the header below, so
5894                  * it's invalid for this header to have been evicted out
5895                  * of the ghost cache, prior to being written out. The
5896                  * ARC_FLAG_L2_WRITING bit ensures this won't happen.
5897                  */
5898                 ASSERT(HDR_HAS_L1HDR(hdr));
5899
5900                 /*
5901                  * We shouldn't need to lock the buffer here, since we flagged
5902                  * it as ARC_FLAG_L2_WRITING in the previous step, but we must
5903                  * take care to only access its L2 cache parameters. In
5904                  * particular, hdr->l1hdr.b_buf may be invalid by now due to
5905                  * ARC eviction.
5906                  */
5907                 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
5908
5909                 if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
5910                     hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
5911                         if (l2arc_compress_buf(hdr)) {
5912                                 /*
5913                                  * If compression succeeded, enable headroom
5914                                  * boost on the next scan cycle.
5915                                  */
5916                                 *headroom_boost = B_TRUE;
5917                         }
5918                 }
5919
5920                 /*
5921                  * Pick up the buffer data we had previously stashed away
5922                  * (and now potentially also compressed).
5923                  */
5924                 buf_data = hdr->b_l1hdr.b_tmp_cdata;
5925                 buf_sz = hdr->b_l2hdr.b_asize;
5926
5927                 /* Compression may have squashed the buffer to zero length. */
5928                 if (buf_sz != 0) {
5929                         uint64_t buf_p_sz;
5930
5931                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
5932                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5933                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5934                             ZIO_FLAG_CANFAIL, B_FALSE);
5935
5936                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5937                             zio_t *, wzio);
5938                         (void) zio_nowait(wzio);
5939
5940                         write_asize += buf_sz;
5941                         /*
5942                          * Keep the clock hand suitably device-aligned.
5943                          */
5944                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5945                         write_psize += buf_p_sz;
5946                         dev->l2ad_hand += buf_p_sz;
5947                 }
5948         }
5949
5950         mutex_exit(&dev->l2ad_mtx);
5951
5952         ASSERT3U(write_asize, <=, target_sz);
5953         ARCSTAT_BUMP(arcstat_l2_writes_sent);
5954         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5955         ARCSTAT_INCR(arcstat_l2_size, write_sz);
5956         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5957         vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
5958
5959         /*
5960          * Bump device hand to the device start if it is approaching the end.
5961          * l2arc_evict() will already have evicted ahead for this case.
5962          */
5963         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5964                 dev->l2ad_hand = dev->l2ad_start;
5965                 dev->l2ad_evict = dev->l2ad_start;
5966                 dev->l2ad_first = B_FALSE;
5967         }
5968
5969         dev->l2ad_writing = B_TRUE;
5970         (void) zio_wait(pio);
5971         dev->l2ad_writing = B_FALSE;
5972
5973         return (write_asize);
5974 }
5975
5976 /*
5977  * Compresses an L2ARC buffer.
5978  * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
5979  * size in l2hdr->b_asize. This routine tries to compress the data and
5980  * depending on the compression result there are three possible outcomes:
5981  * *) The buffer was incompressible. The original l2hdr contents were left
5982  *    untouched and are ready for writing to an L2 device.
5983  * *) The buffer was all-zeros, so there is no need to write it to an L2
5984  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5985  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5986  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5987  *    data buffer which holds the compressed data to be written, and b_asize
5988  *    tells us how much data there is. b_compress is set to the appropriate
5989  *    compression algorithm. Once writing is done, invoke
5990  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5991  *
5992  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5993  * buffer was incompressible).
5994  */
5995 static boolean_t
5996 l2arc_compress_buf(arc_buf_hdr_t *hdr)
5997 {
5998         void *cdata;
5999         size_t csize, len, rounded;
6000         l2arc_buf_hdr_t *l2hdr;
6001
6002         ASSERT(HDR_HAS_L2HDR(hdr));
6003
6004         l2hdr = &hdr->b_l2hdr;
6005
6006         ASSERT(HDR_HAS_L1HDR(hdr));
6007         ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
6008         ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6009
6010         len = l2hdr->b_asize;
6011         cdata = zio_data_buf_alloc(len);
6012         ASSERT3P(cdata, !=, NULL);
6013         csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6014             cdata, l2hdr->b_asize);
6015
6016         rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6017         if (rounded > csize) {
6018                 bzero((char *)cdata + csize, rounded - csize);
6019                 csize = rounded;
6020         }
6021
6022         if (csize == 0) {
6023                 /* zero block, indicate that there's nothing to write */
6024                 zio_data_buf_free(cdata, len);
6025                 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
6026                 l2hdr->b_asize = 0;
6027                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6028                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6029                 return (B_TRUE);
6030         } else if (csize > 0 && csize < len) {
6031                 /*
6032                  * Compression succeeded, we'll keep the cdata around for
6033                  * writing and release it afterwards.
6034                  */
6035                 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
6036                 l2hdr->b_asize = csize;
6037                 hdr->b_l1hdr.b_tmp_cdata = cdata;
6038                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
6039                 return (B_TRUE);
6040         } else {
6041                 /*
6042                  * Compression failed, release the compressed buffer.
6043                  * l2hdr will be left unmodified.
6044                  */
6045                 zio_data_buf_free(cdata, len);
6046                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
6047                 return (B_FALSE);
6048         }
6049 }
6050
6051 /*
6052  * Decompresses a zio read back from an l2arc device. On success, the
6053  * underlying zio's io_data buffer is overwritten by the uncompressed
6054  * version. On decompression error (corrupt compressed stream), the
6055  * zio->io_error value is set to signal an I/O error.
6056  *
6057  * Please note that the compressed data stream is not checksummed, so
6058  * if the underlying device is experiencing data corruption, we may feed
6059  * corrupt data to the decompressor, so the decompressor needs to be
6060  * able to handle this situation (LZ4 does).
6061  */
6062 static void
6063 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6064 {
6065         uint64_t csize;
6066         void *cdata;
6067
6068         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6069
6070         if (zio->io_error != 0) {
6071                 /*
6072                  * An io error has occured, just restore the original io
6073                  * size in preparation for a main pool read.
6074                  */
6075                 zio->io_orig_size = zio->io_size = hdr->b_size;
6076                 return;
6077         }
6078
6079         if (c == ZIO_COMPRESS_EMPTY) {
6080                 /*
6081                  * An empty buffer results in a null zio, which means we
6082                  * need to fill its io_data after we're done restoring the
6083                  * buffer's contents.
6084                  */
6085                 ASSERT(hdr->b_l1hdr.b_buf != NULL);
6086                 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6087                 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6088         } else {
6089                 ASSERT(zio->io_data != NULL);
6090                 /*
6091                  * We copy the compressed data from the start of the arc buffer
6092                  * (the zio_read will have pulled in only what we need, the
6093                  * rest is garbage which we will overwrite at decompression)
6094                  * and then decompress back to the ARC data buffer. This way we
6095                  * can minimize copying by simply decompressing back over the
6096                  * original compressed data (rather than decompressing to an
6097                  * aux buffer and then copying back the uncompressed buffer,
6098                  * which is likely to be much larger).
6099                  */
6100                 csize = zio->io_size;
6101                 cdata = zio_data_buf_alloc(csize);
6102                 bcopy(zio->io_data, cdata, csize);
6103                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6104                     hdr->b_size) != 0)
6105                         zio->io_error = SET_ERROR(EIO);
6106                 zio_data_buf_free(cdata, csize);
6107         }
6108
6109         /* Restore the expected uncompressed IO size. */
6110         zio->io_orig_size = zio->io_size = hdr->b_size;
6111 }
6112
6113 /*
6114  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6115  * This buffer serves as a temporary holder of compressed data while
6116  * the buffer entry is being written to an l2arc device. Once that is
6117  * done, we can dispose of it.
6118  */
6119 static void
6120 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6121 {
6122         enum zio_compress comp = HDR_GET_COMPRESS(hdr);
6123
6124         ASSERT(HDR_HAS_L1HDR(hdr));
6125         ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6126
6127         if (comp == ZIO_COMPRESS_OFF) {
6128                 /*
6129                  * In this case, b_tmp_cdata points to the same buffer
6130                  * as the arc_buf_t's b_data field. We don't want to
6131                  * free it, since the arc_buf_t will handle that.
6132                  */
6133                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6134         } else if (comp == ZIO_COMPRESS_EMPTY) {
6135                 /*
6136                  * In this case, b_tmp_cdata was compressed to an empty
6137                  * buffer, thus there's nothing to free and b_tmp_cdata
6138                  * should have been set to NULL in l2arc_write_buffers().
6139                  */
6140                 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6141         } else {
6142                 /*
6143                  * If the data was compressed, then we've allocated a
6144                  * temporary buffer for it, so now we need to release it.
6145                  */
6146                 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6147                 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6148                     hdr->b_size);
6149                 hdr->b_l1hdr.b_tmp_cdata = NULL;
6150         }
6151
6152 }
6153
6154 /*
6155  * This thread feeds the L2ARC at regular intervals.  This is the beating
6156  * heart of the L2ARC.
6157  */
6158 static void
6159 l2arc_feed_thread(void)
6160 {
6161         callb_cpr_t cpr;
6162         l2arc_dev_t *dev;
6163         spa_t *spa;
6164         uint64_t size, wrote;
6165         clock_t begin, next = ddi_get_lbolt();
6166         boolean_t headroom_boost = B_FALSE;
6167         fstrans_cookie_t cookie;
6168
6169         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6170
6171         mutex_enter(&l2arc_feed_thr_lock);
6172
6173         cookie = spl_fstrans_mark();
6174         while (l2arc_thread_exit == 0) {
6175                 CALLB_CPR_SAFE_BEGIN(&cpr);
6176                 (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
6177                     &l2arc_feed_thr_lock, next);
6178                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
6179                 next = ddi_get_lbolt() + hz;
6180
6181                 /*
6182                  * Quick check for L2ARC devices.
6183                  */
6184                 mutex_enter(&l2arc_dev_mtx);
6185                 if (l2arc_ndev == 0) {
6186                         mutex_exit(&l2arc_dev_mtx);
6187                         continue;
6188                 }
6189                 mutex_exit(&l2arc_dev_mtx);
6190                 begin = ddi_get_lbolt();
6191
6192                 /*
6193                  * This selects the next l2arc device to write to, and in
6194                  * doing so the next spa to feed from: dev->l2ad_spa.   This
6195                  * will return NULL if there are now no l2arc devices or if
6196                  * they are all faulted.
6197                  *
6198                  * If a device is returned, its spa's config lock is also
6199                  * held to prevent device removal.  l2arc_dev_get_next()
6200                  * will grab and release l2arc_dev_mtx.
6201                  */
6202                 if ((dev = l2arc_dev_get_next()) == NULL)
6203                         continue;
6204
6205                 spa = dev->l2ad_spa;
6206                 ASSERT(spa != NULL);
6207
6208                 /*
6209                  * If the pool is read-only then force the feed thread to
6210                  * sleep a little longer.
6211                  */
6212                 if (!spa_writeable(spa)) {
6213                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6214                         spa_config_exit(spa, SCL_L2ARC, dev);
6215                         continue;
6216                 }
6217
6218                 /*
6219                  * Avoid contributing to memory pressure.
6220                  */
6221                 if (arc_no_grow) {
6222                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6223                         spa_config_exit(spa, SCL_L2ARC, dev);
6224                         continue;
6225                 }
6226
6227                 ARCSTAT_BUMP(arcstat_l2_feeds);
6228
6229                 size = l2arc_write_size();
6230
6231                 /*
6232                  * Evict L2ARC buffers that will be overwritten.
6233                  */
6234                 l2arc_evict(dev, size, B_FALSE);
6235
6236                 /*
6237                  * Write ARC buffers.
6238                  */
6239                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6240
6241                 /*
6242                  * Calculate interval between writes.
6243                  */
6244                 next = l2arc_write_interval(begin, size, wrote);
6245                 spa_config_exit(spa, SCL_L2ARC, dev);
6246         }
6247         spl_fstrans_unmark(cookie);
6248
6249         l2arc_thread_exit = 0;
6250         cv_broadcast(&l2arc_feed_thr_cv);
6251         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
6252         thread_exit();
6253 }
6254
6255 boolean_t
6256 l2arc_vdev_present(vdev_t *vd)
6257 {
6258         l2arc_dev_t *dev;
6259
6260         mutex_enter(&l2arc_dev_mtx);
6261         for (dev = list_head(l2arc_dev_list); dev != NULL;
6262             dev = list_next(l2arc_dev_list, dev)) {
6263                 if (dev->l2ad_vdev == vd)
6264                         break;
6265         }
6266         mutex_exit(&l2arc_dev_mtx);
6267
6268         return (dev != NULL);
6269 }
6270
6271 /*
6272  * Add a vdev for use by the L2ARC.  By this point the spa has already
6273  * validated the vdev and opened it.
6274  */
6275 void
6276 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
6277 {
6278         l2arc_dev_t *adddev;
6279
6280         ASSERT(!l2arc_vdev_present(vd));
6281
6282         /*
6283          * Create a new l2arc device entry.
6284          */
6285         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
6286         adddev->l2ad_spa = spa;
6287         adddev->l2ad_vdev = vd;
6288         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
6289         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
6290         adddev->l2ad_hand = adddev->l2ad_start;
6291         adddev->l2ad_evict = adddev->l2ad_start;
6292         adddev->l2ad_first = B_TRUE;
6293         adddev->l2ad_writing = B_FALSE;
6294         list_link_init(&adddev->l2ad_node);
6295
6296         mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
6297         /*
6298          * This is a list of all ARC buffers that are still valid on the
6299          * device.
6300          */
6301         list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
6302             offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
6303
6304         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
6305
6306         /*
6307          * Add device to global list
6308          */
6309         mutex_enter(&l2arc_dev_mtx);
6310         list_insert_head(l2arc_dev_list, adddev);
6311         atomic_inc_64(&l2arc_ndev);
6312         mutex_exit(&l2arc_dev_mtx);
6313 }
6314
6315 /*
6316  * Remove a vdev from the L2ARC.
6317  */
6318 void
6319 l2arc_remove_vdev(vdev_t *vd)
6320 {
6321         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6322
6323         /*
6324          * Find the device by vdev
6325          */
6326         mutex_enter(&l2arc_dev_mtx);
6327         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6328                 nextdev = list_next(l2arc_dev_list, dev);
6329                 if (vd == dev->l2ad_vdev) {
6330                         remdev = dev;
6331                         break;
6332                 }
6333         }
6334         ASSERT(remdev != NULL);
6335
6336         /*
6337          * Remove device from global list
6338          */
6339         list_remove(l2arc_dev_list, remdev);
6340         l2arc_dev_last = NULL;          /* may have been invalidated */
6341         atomic_dec_64(&l2arc_ndev);
6342         mutex_exit(&l2arc_dev_mtx);
6343
6344         /*
6345          * Clear all buflists and ARC references.  L2ARC device flush.
6346          */
6347         l2arc_evict(remdev, 0, B_TRUE);
6348         list_destroy(&remdev->l2ad_buflist);
6349         mutex_destroy(&remdev->l2ad_mtx);
6350         kmem_free(remdev, sizeof (l2arc_dev_t));
6351 }
6352
6353 void
6354 l2arc_init(void)
6355 {
6356         l2arc_thread_exit = 0;
6357         l2arc_ndev = 0;
6358         l2arc_writes_sent = 0;
6359         l2arc_writes_done = 0;
6360
6361         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6362         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6363         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
6364         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6365
6366         l2arc_dev_list = &L2ARC_dev_list;
6367         l2arc_free_on_write = &L2ARC_free_on_write;
6368         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6369             offsetof(l2arc_dev_t, l2ad_node));
6370         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6371             offsetof(l2arc_data_free_t, l2df_list_node));
6372 }
6373
6374 void
6375 l2arc_fini(void)
6376 {
6377         /*
6378          * This is called from dmu_fini(), which is called from spa_fini();
6379          * Because of this, we can assume that all l2arc devices have
6380          * already been removed when the pools themselves were removed.
6381          */
6382
6383         l2arc_do_free_on_write();
6384
6385         mutex_destroy(&l2arc_feed_thr_lock);
6386         cv_destroy(&l2arc_feed_thr_cv);
6387         mutex_destroy(&l2arc_dev_mtx);
6388         mutex_destroy(&l2arc_free_on_write_mtx);
6389
6390         list_destroy(l2arc_dev_list);
6391         list_destroy(l2arc_free_on_write);
6392 }
6393
6394 void
6395 l2arc_start(void)
6396 {
6397         if (!(spa_mode_global & FWRITE))
6398                 return;
6399
6400         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6401             TS_RUN, minclsyspri);
6402 }
6403
6404 void
6405 l2arc_stop(void)
6406 {
6407         if (!(spa_mode_global & FWRITE))
6408                 return;
6409
6410         mutex_enter(&l2arc_feed_thr_lock);
6411         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
6412         l2arc_thread_exit = 1;
6413         while (l2arc_thread_exit != 0)
6414                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6415         mutex_exit(&l2arc_feed_thr_lock);
6416 }
6417
/*
 * Kernel module glue: exported symbols and module parameters. Only built
 * when both _KERNEL and HAVE_SPL are defined.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* ARC entry points made available to other kernel modules. */
EXPORT_SYMBOL(arc_buf_size);
EXPORT_SYMBOL(arc_write);
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);

/*
 * ARC tunables. Each module_param() is paired with a MODULE_PARM_DESC()
 * carrying its human-readable description. The last module_param()
 * argument is the permission mode of the parameter; all are 0644 except
 * zfs_arc_average_blocksize, which is 0444.
 */
module_param(zfs_arc_min, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");

module_param(zfs_arc_max, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_max, "Max arc size");

module_param(zfs_arc_meta_limit, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");

module_param(zfs_arc_meta_min, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");

module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");

module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
        "Limit number of restarts in arc_adjust_meta");

module_param(zfs_arc_meta_strategy, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");

module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");

module_param(zfs_arc_p_aggressive_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");

module_param(zfs_arc_p_dampener_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");

module_param(zfs_arc_shrink_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");

module_param(zfs_disable_dup_eviction, int, 0644);
MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");

/* The only parameter in this table declared with mode 0444. */
module_param(zfs_arc_average_blocksize, int, 0444);
MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");

module_param(zfs_arc_memory_throttle_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");

module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");

module_param(zfs_arc_num_sublists_per_state, int, 0644);
MODULE_PARM_DESC(zfs_arc_num_sublists_per_state,
        "Number of sublists used in each of the ARC state lists");

/* L2ARC tunables. */
module_param(l2arc_write_max, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");

module_param(l2arc_write_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");

module_param(l2arc_headroom, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");

module_param(l2arc_headroom_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");

module_param(l2arc_feed_secs, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");

module_param(l2arc_feed_min_ms, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");

module_param(l2arc_noprefetch, int, 0644);
MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");

module_param(l2arc_nocompress, int, 0644);
MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");

module_param(l2arc_feed_again, int, 0644);
MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");

module_param(l2arc_norw, int, 0644);
MODULE_PARM_DESC(l2arc_norw, "No reads during writes");

#endif