module/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27
  28 /*
  29  * DVA-based Adjustable Replacement Cache
  30  *
  31  * While much of the theory of operation used here is
  32  * based on the self-tuning, low overhead replacement cache
  33  * presented by Megiddo and Modha at FAST 2003, there are some
  34  * significant differences:
  35  *
  36  * 1. The Megiddo and Modha model assumes any page is evictable.
  37  * Pages in its cache cannot be "locked" into memory.  This makes
  38  * the eviction algorithm simple: evict the last page in the list.
  39  * This also make the performance characteristics easy to reason
  40  * about.  Our cache is not so simple.  At any given moment, some
  41  * subset of the blocks in the cache are un-evictable because we
  42  * have handed out a reference to them.  Blocks are only evictable
  43  * when there are no external references active.  This makes
  44  * eviction far more problematic:  we choose to evict the evictable
  45  * blocks that are the "lowest" in the list.
  46  *
  47  * There are times when it is not possible to evict the requested
  48  * space.  In these circumstances we are unable to adjust the cache
  49  * size.  To prevent the cache growing unbounded at these times we
  50  * implement a "cache throttle" that slows the flow of new data
  51  * into the cache until we can make space available.
  52  *
  53  * 2. The Megiddo and Modha model assumes a fixed cache size.
  54  * Pages are evicted when the cache is full and there is a cache
  55  * miss.  Our model has a variable sized cache.  It grows with
  56  * high use, but also tries to react to memory pressure from the
  57  * operating system: decreasing its size when system memory is
  58  * tight.
  59  *
  60  * 3. The Megiddo and Modha model assumes a fixed page size. All
  61  * elements of the cache are therefore exactly the same size.  So
  62  * when adjusting the cache size following a cache miss, its simply
  63  * a matter of choosing a single page to evict.  In our model, we
  64  * have variable sized cache blocks (rangeing from 512 bytes to
  65  * 128K bytes).  We therefore choose a set of blocks to evict to make
  66  * space for a cache miss that approximates as closely as possible
  67  * the space used by the new block.
  68  *
  69  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  70  * by N. Megiddo & D. Modha, FAST 2003
  71  */
  72
  73 /*
  74  * The locking model:
  75  *
  76  * A new reference to a cache buffer can be obtained in two
  77  * ways: 1) via a hash table lookup using the DVA as a key,
  78  * or 2) via one of the ARC lists.  The arc_read() interface
  79  * uses method 1, while the internal arc algorithms for
  80  * adjusting the cache use method 2.  We therefore provide two
  81  * types of locks: 1) the hash table lock array, and 2) the
  82  * arc list locks.
  83  *
  84  * Buffers do not have their own mutexes, rather they rely on the
  85  * hash table mutexes for the bulk of their protection (i.e. most
  86  * fields in the arc_buf_hdr_t are protected by these mutexes).
  87  *
  88  * buf_hash_find() returns the appropriate mutex (held) when it
  89  * locates the requested buffer in the hash table.  It returns
  90  * NULL for the mutex if the buffer was not in the table.
  91  *
  92  * buf_hash_remove() expects the appropriate hash mutex to be
  93  * already held before it is invoked.
  94  *
  95  * Each arc state also has a mutex which is used to protect the
  96  * buffer list associated with the state.  When attempting to
  97  * obtain a hash table lock while holding an arc list lock you
  98  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  99  * the active state mutex must be held before the ghost state mutex.
 100  *
 101  * Arc buffers may have an associated eviction callback function.
 102  * This function will be invoked prior to removing the buffer (e.g.
 103  * in arc_do_user_evicts()).  Note however that the data associated
 104  * with the buffer may be evicted prior to the callback.  The callback
 105  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 106  * the users of callbacks must ensure that their private data is
 107  * protected from simultaneous callbacks from arc_clear_callback()
 108  * and arc_do_user_evicts().
 109  *
 110  * It as also possible to register a callback which is run when the
 111  * arc_meta_limit is reached and no buffers can be safely evicted.  In
 112  * this case the arc user should drop a reference on some arc buffers so
 113  * they can be reclaimed and the arc_meta_limit honored.  For example,
 114  * when using the ZPL each dentry holds a references on a znode.  These
 115  * dentries must be pruned before the arc buffer holding the znode can
 116  * be safely evicted.
 117  *
 118  * Note that the majority of the performance stats are manipulated
 119  * with atomic operations.
 120  *
 121  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 122  *
 123  *      - L2ARC buflist creation
 124  *      - L2ARC buflist eviction
 125  *      - L2ARC write completion, which walks L2ARC buflists
 126  *      - ARC header destruction, as it removes from L2ARC buflists
 127  *      - ARC header release, as it removes from L2ARC buflists
 128  */
 129
 130 #include <sys/spa.h>
 131 #include <sys/zio.h>
 132 #include <sys/zio_compress.h>
 133 #include <sys/zfs_context.h>
 134 #include <sys/arc.h>
 135 #include <sys/vdev.h>
 136 #include <sys/vdev_impl.h>
 137 #include <sys/dsl_pool.h>
 138 #ifdef _KERNEL
 139 #include <sys/vmsystm.h>
 140 #include <vm/anon.h>
 141 #include <sys/fs/swapnode.h>
 142 #include <sys/zpl.h>
 143 #include <linux/mm_compat.h>
 144 #endif
 145 #include <sys/callb.h>
 146 #include <sys/kstat.h>
 147 #include <sys/dmu_tx.h>
 148 #include <zfs_fletcher.h>
 149 #include <sys/arc_impl.h>
 150 #include <sys/trace_arc.h>
 151
 152 #ifndef _KERNEL
 153 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 154 boolean_t arc_watch = B_FALSE;
 155 #endif
 156
 157 static kmutex_t         arc_reclaim_thr_lock;
 158 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 159 static uint8_t          arc_thread_exit;
 160
 161 /* number of objects to prune from caches when arc_meta_limit is reached */
 162 int zfs_arc_meta_prune = 10000;
 163
 164 typedef enum arc_reclaim_strategy {
 165         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 166         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 167 } arc_reclaim_strategy_t;
 168
 169 /*
 170  * The number of iterations through arc_evict_*() before we
 171  * drop & reacquire the lock.
 172  */
 173 int arc_evict_iterations = 100;
 174
 175 /* number of seconds before growing cache again */
 176 int zfs_arc_grow_retry = 5;
 177
 178 /* disable anon data aggressively growing arc_p */
 179 int zfs_arc_p_aggressive_disable = 1;
 180
 181 /* disable arc_p adapt dampener in arc_adapt */
 182 int zfs_arc_p_dampener_disable = 1;
 183
 184 /* log2(fraction of arc to reclaim) */
 185 int zfs_arc_shrink_shift = 5;
 186
 187 /*
 188  * minimum lifespan of a prefetch block in clock ticks
 189  * (initialized in arc_init())
 190  */
 191 int zfs_arc_min_prefetch_lifespan = HZ;
 192
 193 /* disable arc proactive arc throttle due to low memory */
 194 int zfs_arc_memory_throttle_disable = 1;
 195
 196 /* disable duplicate buffer eviction */
 197 int zfs_disable_dup_eviction = 0;
 198
 199 /* average block used to size buf_hash_table */
 200 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 201
 202 /*
 203  * If this percent of memory is free, don't throttle.
 204  */
 205 int arc_lotsfree_percent = 10;
 206
 207 static int arc_dead;
 208
 209 /* expiration time for arc_no_grow */
 210 static clock_t arc_grow_time = 0;
 211
 212 /*
 213  * The arc has filled available memory and has now warmed up.
 214  */
 215 static boolean_t arc_warm;
 216
 217 /*
 218  * These tunables are for performance analysis.
 219  */
 220 unsigned long zfs_arc_max = 0;
 221 unsigned long zfs_arc_min = 0;
 222 unsigned long zfs_arc_meta_limit = 0;
 223
 224 /*
 225  * Limit the number of restarts in arc_adjust_meta()
 226  */
 227 unsigned long zfs_arc_meta_adjust_restarts = 4096;
 228
 229 /* The 6 states: */
 230 static arc_state_t ARC_anon;
 231 static arc_state_t ARC_mru;
 232 static arc_state_t ARC_mru_ghost;
 233 static arc_state_t ARC_mfu;
 234 static arc_state_t ARC_mfu_ghost;
 235 static arc_state_t ARC_l2c_only;
 236
 237 typedef struct arc_stats {
 238         kstat_named_t arcstat_hits;
 239         kstat_named_t arcstat_misses;
 240         kstat_named_t arcstat_demand_data_hits;
 241         kstat_named_t arcstat_demand_data_misses;
 242         kstat_named_t arcstat_demand_metadata_hits;
 243         kstat_named_t arcstat_demand_metadata_misses;
 244         kstat_named_t arcstat_prefetch_data_hits;
 245         kstat_named_t arcstat_prefetch_data_misses;
 246         kstat_named_t arcstat_prefetch_metadata_hits;
 247         kstat_named_t arcstat_prefetch_metadata_misses;
 248         kstat_named_t arcstat_mru_hits;
 249         kstat_named_t arcstat_mru_ghost_hits;
 250         kstat_named_t arcstat_mfu_hits;
 251         kstat_named_t arcstat_mfu_ghost_hits;
 252         kstat_named_t arcstat_deleted;
 253         kstat_named_t arcstat_recycle_miss;
 254         /*
 255          * Number of buffers that could not be evicted because the hash lock
 256          * was held by another thread.  The lock may not necessarily be held
 257          * by something using the same buffer, since hash locks are shared
 258          * by multiple buffers.
 259          */
 260         kstat_named_t arcstat_mutex_miss;
 261         /*
 262          * Number of buffers skipped because they have I/O in progress, are
 263          * indrect prefetch buffers that have not lived long enough, or are
 264          * not from the spa we're trying to evict from.
 265          */
 266         kstat_named_t arcstat_evict_skip;
 267         kstat_named_t arcstat_evict_l2_cached;
 268         kstat_named_t arcstat_evict_l2_eligible;
 269         kstat_named_t arcstat_evict_l2_ineligible;
 270         kstat_named_t arcstat_hash_elements;
 271         kstat_named_t arcstat_hash_elements_max;
 272         kstat_named_t arcstat_hash_collisions;
 273         kstat_named_t arcstat_hash_chains;
 274         kstat_named_t arcstat_hash_chain_max;
 275         kstat_named_t arcstat_p;
 276         kstat_named_t arcstat_c;
 277         kstat_named_t arcstat_c_min;
 278         kstat_named_t arcstat_c_max;
 279         kstat_named_t arcstat_size;
 280         kstat_named_t arcstat_hdr_size;
 281         kstat_named_t arcstat_data_size;
 282         kstat_named_t arcstat_meta_size;
 283         kstat_named_t arcstat_other_size;
 284         kstat_named_t arcstat_anon_size;
 285         kstat_named_t arcstat_anon_evict_data;
 286         kstat_named_t arcstat_anon_evict_metadata;
 287         kstat_named_t arcstat_mru_size;
 288         kstat_named_t arcstat_mru_evict_data;
 289         kstat_named_t arcstat_mru_evict_metadata;
 290         kstat_named_t arcstat_mru_ghost_size;
 291         kstat_named_t arcstat_mru_ghost_evict_data;
 292         kstat_named_t arcstat_mru_ghost_evict_metadata;
 293         kstat_named_t arcstat_mfu_size;
 294         kstat_named_t arcstat_mfu_evict_data;
 295         kstat_named_t arcstat_mfu_evict_metadata;
 296         kstat_named_t arcstat_mfu_ghost_size;
 297         kstat_named_t arcstat_mfu_ghost_evict_data;
 298         kstat_named_t arcstat_mfu_ghost_evict_metadata;
 299         kstat_named_t arcstat_l2_hits;
 300         kstat_named_t arcstat_l2_misses;
 301         kstat_named_t arcstat_l2_feeds;
 302         kstat_named_t arcstat_l2_rw_clash;
 303         kstat_named_t arcstat_l2_read_bytes;
 304         kstat_named_t arcstat_l2_write_bytes;
 305         kstat_named_t arcstat_l2_writes_sent;
 306         kstat_named_t arcstat_l2_writes_done;
 307         kstat_named_t arcstat_l2_writes_error;
 308         kstat_named_t arcstat_l2_writes_hdr_miss;
 309         kstat_named_t arcstat_l2_evict_lock_retry;
 310         kstat_named_t arcstat_l2_evict_reading;
 311         kstat_named_t arcstat_l2_free_on_write;
 312         kstat_named_t arcstat_l2_cdata_free_on_write;
 313         kstat_named_t arcstat_l2_abort_lowmem;
 314         kstat_named_t arcstat_l2_cksum_bad;
 315         kstat_named_t arcstat_l2_io_error;
 316         kstat_named_t arcstat_l2_size;
 317         kstat_named_t arcstat_l2_asize;
 318         kstat_named_t arcstat_l2_hdr_size;
 319         kstat_named_t arcstat_l2_compress_successes;
 320         kstat_named_t arcstat_l2_compress_zeros;
 321         kstat_named_t arcstat_l2_compress_failures;
 322         kstat_named_t arcstat_memory_throttle_count;
 323         kstat_named_t arcstat_duplicate_buffers;
 324         kstat_named_t arcstat_duplicate_buffers_size;
 325         kstat_named_t arcstat_duplicate_reads;
 326         kstat_named_t arcstat_memory_direct_count;
 327         kstat_named_t arcstat_memory_indirect_count;
 328         kstat_named_t arcstat_no_grow;
 329         kstat_named_t arcstat_tempreserve;
 330         kstat_named_t arcstat_loaned_bytes;
 331         kstat_named_t arcstat_prune;
 332         kstat_named_t arcstat_meta_used;
 333         kstat_named_t arcstat_meta_limit;
 334         kstat_named_t arcstat_meta_max;
 335 } arc_stats_t;
 336
 337 static arc_stats_t arc_stats = {
 338         { "hits",                       KSTAT_DATA_UINT64 },
 339         { "misses",                     KSTAT_DATA_UINT64 },
 340         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 341         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 342         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 343         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 344         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 345         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 346         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 347         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 348         { "mru_hits",                   KSTAT_DATA_UINT64 },
 349         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 350         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 351         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 352         { "deleted",                    KSTAT_DATA_UINT64 },
 353         { "recycle_miss",               KSTAT_DATA_UINT64 },
 354         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 355         { "evict_skip",                 KSTAT_DATA_UINT64 },
 356         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 357         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 358         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 359         { "hash_elements",              KSTAT_DATA_UINT64 },
 360         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 361         { "hash_collisions",            KSTAT_DATA_UINT64 },
 362         { "hash_chains",                KSTAT_DATA_UINT64 },
 363         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 364         { "p",                          KSTAT_DATA_UINT64 },
 365         { "c",                          KSTAT_DATA_UINT64 },
 366         { "c_min",                      KSTAT_DATA_UINT64 },
 367         { "c_max",                      KSTAT_DATA_UINT64 },
 368         { "size",                       KSTAT_DATA_UINT64 },
 369         { "hdr_size",                   KSTAT_DATA_UINT64 },
 370         { "data_size",                  KSTAT_DATA_UINT64 },
 371         { "meta_size",                  KSTAT_DATA_UINT64 },
 372         { "other_size",                 KSTAT_DATA_UINT64 },
 373         { "anon_size",                  KSTAT_DATA_UINT64 },
 374         { "anon_evict_data",            KSTAT_DATA_UINT64 },
 375         { "anon_evict_metadata",        KSTAT_DATA_UINT64 },
 376         { "mru_size",                   KSTAT_DATA_UINT64 },
 377         { "mru_evict_data",             KSTAT_DATA_UINT64 },
 378         { "mru_evict_metadata",         KSTAT_DATA_UINT64 },
 379         { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 380         { "mru_ghost_evict_data",       KSTAT_DATA_UINT64 },
 381         { "mru_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
 382         { "mfu_size",                   KSTAT_DATA_UINT64 },
 383         { "mfu_evict_data",             KSTAT_DATA_UINT64 },
 384         { "mfu_evict_metadata",         KSTAT_DATA_UINT64 },
 385         { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 386         { "mfu_ghost_evict_data",       KSTAT_DATA_UINT64 },
 387         { "mfu_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
 388         { "l2_hits",                    KSTAT_DATA_UINT64 },
 389         { "l2_misses",                  KSTAT_DATA_UINT64 },
 390         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 391         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 392         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 393         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 394         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 395         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 396         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 397         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 398         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 399         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 400         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 401         { "l2_cdata_free_on_write",     KSTAT_DATA_UINT64 },
 402         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 403         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 404         { "l2_io_error",                KSTAT_DATA_UINT64 },
 405         { "l2_size",                    KSTAT_DATA_UINT64 },
 406         { "l2_asize",                   KSTAT_DATA_UINT64 },
 407         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 408         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 409         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 410         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 411         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 412         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 413         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 414         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 415         { "memory_direct_count",        KSTAT_DATA_UINT64 },
 416         { "memory_indirect_count",      KSTAT_DATA_UINT64 },
 417         { "arc_no_grow",                KSTAT_DATA_UINT64 },
 418         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
 419         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
 420         { "arc_prune",                  KSTAT_DATA_UINT64 },
 421         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 422         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 423         { "arc_meta_max",               KSTAT_DATA_UINT64 },
 424 };
 425
 426 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 427
 428 #define ARCSTAT_INCR(stat, val) \
 429         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 430
 431 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 432 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 433
 434 #define ARCSTAT_MAX(stat, val) {                                        \
 435         uint64_t m;                                                     \
 436         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 437             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 438                 continue;                                               \
 439 }
 440
 441 #define ARCSTAT_MAXSTAT(stat) \
 442         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 443
 444 /*
 445  * We define a macro to allow ARC hits/misses to be easily broken down by
 446  * two separate conditions, giving a total of four different subtypes for
 447  * each of hits and misses (so eight statistics total).
 448  */
 449 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 450         if (cond1) {                                                    \
 451                 if (cond2) {                                            \
 452                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 453                 } else {                                                \
 454                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 455                 }                                                       \
 456         } else {                                                        \
 457                 if (cond2) {                                            \
 458                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 459                 } else {                                                \
 460                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 461                 }                                                       \
 462         }
 463
 464 kstat_t                 *arc_ksp;
 465 static arc_state_t      *arc_anon;
 466 static arc_state_t      *arc_mru;
 467 static arc_state_t      *arc_mru_ghost;
 468 static arc_state_t      *arc_mfu;
 469 static arc_state_t      *arc_mfu_ghost;
 470 static arc_state_t      *arc_l2c_only;
 471
 472 /*
 473  * There are several ARC variables that are critical to export as kstats --
 474  * but we don't want to have to grovel around in the kstat whenever we wish to
 475  * manipulate them.  For these variables, we therefore define them to be in
 476  * terms of the statistic variable.  This assures that we are not introducing
 477  * the possibility of inconsistency by having shadow copies of the variables,
 478  * while still allowing the code to be readable.
 479  */
 480 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 481 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 482 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 483 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 484 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 485 #define arc_no_grow     ARCSTAT(arcstat_no_grow)
 486 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
 487 #define arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
 488 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 489 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 490 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 491
 492 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 493         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 494
 495 static list_t arc_prune_list;
 496 static kmutex_t arc_prune_mtx;
 497 static arc_buf_t *arc_eviction_list;
 498 static kmutex_t arc_eviction_mtx;
 499 static arc_buf_hdr_t arc_eviction_hdr;
 500 static void arc_get_data_buf(arc_buf_t *buf);
 501 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 502 static int arc_evict_needed(arc_buf_contents_t type);
 503 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
 504     arc_buf_contents_t type);
 505 static void arc_buf_watch(arc_buf_t *buf);
 506
 507 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 508
 509 #define GHOST_STATE(state)      \
 510         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 511         (state) == arc_l2c_only)
 512
 513 /*
 514  * Private ARC flags.  These flags are private ARC only flags that will show up
 515  * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
 516  * be passed in as arc_flags in things like arc_read.  However, these flags
 517  * should never be passed and should only be set by ARC code.  When adding new
 518  * public flags, make sure not to smash the private ones.
 519  */
 520
 521 #define ARC_IN_HASH_TABLE       (1 << 9)        /* this buffer is hashed */
 522 #define ARC_IO_IN_PROGRESS      (1 << 10)       /* I/O in progress for buf */
 523 #define ARC_IO_ERROR            (1 << 11)       /* I/O failed for buf */
 524 #define ARC_FREED_IN_READ       (1 << 12)       /* buf freed while in read */
 525 #define ARC_BUF_AVAILABLE       (1 << 13)       /* block not in active use */
 526 #define ARC_INDIRECT            (1 << 14)       /* this is an indirect block */
 527 #define ARC_FREE_IN_PROGRESS    (1 << 15)       /* hdr about to be freed */
 528 #define ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 529 #define ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 530 #define ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
 531
 532 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 533 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 534 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 535 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 536 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 537 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 538 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 539 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 540 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
 541                                     (hdr)->b_l2hdr != NULL)
 542 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 543 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 544 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 545
 546 /*
 547  * Other sizes
 548  */
 549
 550 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 551 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 552
 553 /*
 554  * Hash table routines
 555  */
 556
 557 #define HT_LOCK_ALIGN   64
 558 #define HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 559
 560 struct ht_lock {
 561         kmutex_t        ht_lock;
 562 #ifdef _KERNEL
 563         unsigned char   pad[HT_LOCK_PAD];
 564 #endif
 565 };
 566
 567 #define BUF_LOCKS 8192
 568 typedef struct buf_hash_table {
 569         uint64_t ht_mask;
 570         arc_buf_hdr_t **ht_table;
 571         struct ht_lock ht_locks[BUF_LOCKS];
 572 } buf_hash_table_t;
 573
 574 static buf_hash_table_t buf_hash_table;
 575
 576 #define BUF_HASH_INDEX(spa, dva, birth) \
 577         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 578 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 579 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 580 #define HDR_LOCK(hdr) \
 581         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 582
 583 uint64_t zfs_crc64_table[256];
 584
 585 /*
 586  * Level 2 ARC
 587  */
 588
 589 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 590 #define L2ARC_HEADROOM          2                       /* num of writes */
 591 /*
 592  * If we discover during ARC scan any buffers to be compressed, we boost
 593  * our headroom for the next scanning cycle by this percentage multiple.
 594  */
 595 #define L2ARC_HEADROOM_BOOST    200
 596 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 597 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 598
 599 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 600 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 601
 602 /* L2ARC Performance Tunables */
 603 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;       /* def max write size */
 604 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;     /* extra warmup write */
 605 unsigned long l2arc_headroom = L2ARC_HEADROOM;          /* # of dev writes */
 606 unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 607 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;        /* interval seconds */
 608 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;    /* min interval msecs */
 609 int l2arc_noprefetch = B_TRUE;                  /* don't cache prefetch bufs */
 610 int l2arc_nocompress = B_FALSE;                 /* don't compress bufs */
 611 int l2arc_feed_again = B_TRUE;                  /* turbo warmup */
 612 int l2arc_norw = B_FALSE;                       /* no reads during writes */
 613
 614 /*
 615  * L2ARC Internals
 616  */
 617 static list_t L2ARC_dev_list;                   /* device list */
 618 static list_t *l2arc_dev_list;                  /* device list pointer */
 619 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 620 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 621 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 622 static list_t L2ARC_free_on_write;              /* free after write buf list */
 623 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 624 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 625 static uint64_t l2arc_ndev;                     /* number of devices */
 626
 627 typedef struct l2arc_read_callback {
 628         arc_buf_t               *l2rcb_buf;             /* read buffer */
 629         spa_t                   *l2rcb_spa;             /* spa */
 630         blkptr_t                l2rcb_bp;               /* original blkptr */
 631         zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
 632         int                     l2rcb_flags;            /* original flags */
 633         enum zio_compress       l2rcb_compress;         /* applied compress */
 634 } l2arc_read_callback_t;
 635
 636 struct l2arc_buf_hdr {
 637         /* protected by arc_buf_hdr  mutex */
 638         l2arc_dev_t             *b_dev;         /* L2ARC device */
 639         uint64_t                b_daddr;        /* disk address, offset byte */
 640         /* compression applied to buffer data */
 641         enum zio_compress       b_compress;
 642         /* real alloc'd buffer size depending on b_compress applied */
 643         uint32_t                b_hits;
 644         uint64_t                b_asize;
 645         /* temporary buffer holder for in-flight compressed data */
 646         void                    *b_tmp_cdata;
 647 };
 648
 649 typedef struct l2arc_data_free {
 650         /* protected by l2arc_free_on_write_mtx */
 651         void            *l2df_data;
 652         size_t          l2df_size;
 653         void            (*l2df_func)(void *, size_t);
 654         list_node_t     l2df_list_node;
 655 } l2arc_data_free_t;
 656
 657 static kmutex_t l2arc_feed_thr_lock;
 658 static kcondvar_t l2arc_feed_thr_cv;
 659 static uint8_t l2arc_thread_exit;
 660
 661 static void l2arc_read_done(zio_t *zio);
 662 static void l2arc_hdr_stat_add(void);
 663 static void l2arc_hdr_stat_remove(void);
 664
 665 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
 666 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
 667     enum zio_compress c);
 668 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 669
 670 static uint64_t
 671 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 672 {
 673         uint8_t *vdva = (uint8_t *)dva;
 674         uint64_t crc = -1ULL;
 675         int i;
 676
 677         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 678
 679         for (i = 0; i < sizeof (dva_t); i++)
 680                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 681
 682         crc ^= (spa>>8) ^ birth;
 683
 684         return (crc);
 685 }
 686
 687 #define BUF_EMPTY(buf)                                          \
 688         ((buf)->b_dva.dva_word[0] == 0 &&                       \
 689         (buf)->b_dva.dva_word[1] == 0 &&                        \
 690         (buf)->b_cksum0 == 0)
 691
 692 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 693         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 694         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 695         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 696
 697 static void
 698 buf_discard_identity(arc_buf_hdr_t *hdr)
 699 {
 700         hdr->b_dva.dva_word[0] = 0;
 701         hdr->b_dva.dva_word[1] = 0;
 702         hdr->b_birth = 0;
 703         hdr->b_cksum0 = 0;
 704 }
 705
 706 static arc_buf_hdr_t *
 707 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 708 {
 709         const dva_t *dva = BP_IDENTITY(bp);
 710         uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 711         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 712         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 713         arc_buf_hdr_t *buf;
 714
 715         mutex_enter(hash_lock);
 716         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 717             buf = buf->b_hash_next) {
 718                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 719                         *lockp = hash_lock;
 720                         return (buf);
 721                 }
 722         }
 723         mutex_exit(hash_lock);
 724         *lockp = NULL;
 725         return (NULL);
 726 }
 727
 728 /*
 729  * Insert an entry into the hash table.  If there is already an element
 730  * equal to elem in the hash table, then the already existing element
 731  * will be returned and the new element will not be inserted.
 732  * Otherwise returns NULL.
 733  */
 734 static arc_buf_hdr_t *
 735 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 736 {
 737         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 738         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 739         arc_buf_hdr_t *fbuf;
 740         uint32_t i;
 741
 742         ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
 743         ASSERT(buf->b_birth != 0);
 744         ASSERT(!HDR_IN_HASH_TABLE(buf));
 745         *lockp = hash_lock;
 746         mutex_enter(hash_lock);
 747         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 748             fbuf = fbuf->b_hash_next, i++) {
 749                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 750                         return (fbuf);
 751         }
 752
 753         buf->b_hash_next = buf_hash_table.ht_table[idx];
 754         buf_hash_table.ht_table[idx] = buf;
 755         buf->b_flags |= ARC_IN_HASH_TABLE;
 756
 757         /* collect some hash table performance data */
 758         if (i > 0) {
 759                 ARCSTAT_BUMP(arcstat_hash_collisions);
 760                 if (i == 1)
 761                         ARCSTAT_BUMP(arcstat_hash_chains);
 762
 763                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 764         }
 765
 766         ARCSTAT_BUMP(arcstat_hash_elements);
 767         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 768
 769         return (NULL);
 770 }
 771
 772 static void
 773 buf_hash_remove(arc_buf_hdr_t *buf)
 774 {
 775         arc_buf_hdr_t *fbuf, **bufp;
 776         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 777
 778         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 779         ASSERT(HDR_IN_HASH_TABLE(buf));
 780
 781         bufp = &buf_hash_table.ht_table[idx];
 782         while ((fbuf = *bufp) != buf) {
 783                 ASSERT(fbuf != NULL);
 784                 bufp = &fbuf->b_hash_next;
 785         }
 786         *bufp = buf->b_hash_next;
 787         buf->b_hash_next = NULL;
 788         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 789
 790         /* collect some hash table performance data */
 791         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 792
 793         if (buf_hash_table.ht_table[idx] &&
 794             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 795                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 796 }
 797
 798 /*
 799  * Global data structures and functions for the buf kmem cache.
 800  */
 801 static kmem_cache_t *hdr_cache;
 802 static kmem_cache_t *buf_cache;
 803 static kmem_cache_t *l2arc_hdr_cache;
 804
 805 static void
 806 buf_fini(void)
 807 {
 808         int i;
 809
 810 #if defined(_KERNEL) && defined(HAVE_SPL)
 811         /*
 812          * Large allocations which do not require contiguous pages
 813          * should be using vmem_free() in the linux kernel\
 814          */
 815         vmem_free(buf_hash_table.ht_table,
 816             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 817 #else
 818         kmem_free(buf_hash_table.ht_table,
 819             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 820 #endif
 821         for (i = 0; i < BUF_LOCKS; i++)
 822                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 823         kmem_cache_destroy(hdr_cache);
 824         kmem_cache_destroy(buf_cache);
 825         kmem_cache_destroy(l2arc_hdr_cache);
 826 }
 827
 828 /*
 829  * Constructor callback - called when the cache is empty
 830  * and a new buf is requested.
 831  */
 832 /* ARGSUSED */
 833 static int
 834 hdr_cons(void *vbuf, void *unused, int kmflag)
 835 {
 836         arc_buf_hdr_t *buf = vbuf;
 837
 838         bzero(buf, sizeof (arc_buf_hdr_t));
 839         refcount_create(&buf->b_refcnt);
 840         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 841         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 842         list_link_init(&buf->b_arc_node);
 843         list_link_init(&buf->b_l2node);
 844         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 845
 846         return (0);
 847 }
 848
 849 /* ARGSUSED */
 850 static int
 851 buf_cons(void *vbuf, void *unused, int kmflag)
 852 {
 853         arc_buf_t *buf = vbuf;
 854
 855         bzero(buf, sizeof (arc_buf_t));
 856         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 857         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 858
 859         return (0);
 860 }
 861
 862 /*
 863  * Destructor callback - called when a cached buf is
 864  * no longer required.
 865  */
 866 /* ARGSUSED */
 867 static void
 868 hdr_dest(void *vbuf, void *unused)
 869 {
 870         arc_buf_hdr_t *buf = vbuf;
 871
 872         ASSERT(BUF_EMPTY(buf));
 873         refcount_destroy(&buf->b_refcnt);
 874         cv_destroy(&buf->b_cv);
 875         mutex_destroy(&buf->b_freeze_lock);
 876         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 877 }
 878
 879 /* ARGSUSED */
 880 static void
 881 buf_dest(void *vbuf, void *unused)
 882 {
 883         arc_buf_t *buf = vbuf;
 884
 885         mutex_destroy(&buf->b_evict_lock);
 886         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 887 }
 888
 889 static void
 890 buf_init(void)
 891 {
 892         uint64_t *ct;
 893         uint64_t hsize = 1ULL << 12;
 894         int i, j;
 895
 896         /*
 897          * The hash table is big enough to fill all of physical memory
 898          * with an average block size of zfs_arc_average_blocksize (default 8K).
 899          * By default, the table will take up
 900          * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 901          */
 902         while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
 903                 hsize <<= 1;
 904 retry:
 905         buf_hash_table.ht_mask = hsize - 1;
 906 #if defined(_KERNEL) && defined(HAVE_SPL)
 907         /*
 908          * Large allocations which do not require contiguous pages
 909          * should be using vmem_alloc() in the linux kernel
 910          */
 911         buf_hash_table.ht_table =
 912             vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
 913 #else
 914         buf_hash_table.ht_table =
 915             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 916 #endif
 917         if (buf_hash_table.ht_table == NULL) {
 918                 ASSERT(hsize > (1ULL << 8));
 919                 hsize >>= 1;
 920                 goto retry;
 921         }
 922
 923         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 924             0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
 925         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 926             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 927         l2arc_hdr_cache = kmem_cache_create("l2arc_buf_hdr_t", L2HDR_SIZE,
 928             0, NULL, NULL, NULL, NULL, NULL, 0);
 929
 930         for (i = 0; i < 256; i++)
 931                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 932                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 933
 934         for (i = 0; i < BUF_LOCKS; i++) {
 935                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 936                     NULL, MUTEX_DEFAULT, NULL);
 937         }
 938 }
 939
 940 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 941
 942 static void
 943 arc_cksum_verify(arc_buf_t *buf)
 944 {
 945         zio_cksum_t zc;
 946
 947         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 948                 return;
 949
 950         mutex_enter(&buf->b_hdr->b_freeze_lock);
 951         if (buf->b_hdr->b_freeze_cksum == NULL ||
 952             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 953                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 954                 return;
 955         }
 956         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 957         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 958                 panic("buffer modified while frozen!");
 959         mutex_exit(&buf->b_hdr->b_freeze_lock);
 960 }
 961
 962 static int
 963 arc_cksum_equal(arc_buf_t *buf)
 964 {
 965         zio_cksum_t zc;
 966         int equal;
 967
 968         mutex_enter(&buf->b_hdr->b_freeze_lock);
 969         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 970         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 971         mutex_exit(&buf->b_hdr->b_freeze_lock);
 972
 973         return (equal);
 974 }
 975
 976 static void
 977 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 978 {
 979         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 980                 return;
 981
 982         mutex_enter(&buf->b_hdr->b_freeze_lock);
 983         if (buf->b_hdr->b_freeze_cksum != NULL) {
 984                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 985                 return;
 986         }
 987         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
 988             KM_SLEEP);
 989         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
 990             buf->b_hdr->b_freeze_cksum);
 991         mutex_exit(&buf->b_hdr->b_freeze_lock);
 992         arc_buf_watch(buf);
 993 }
 994
 995 #ifndef _KERNEL
 996 void
 997 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
 998 {
 999         panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
1000 }
1001 #endif
1002
1003 /* ARGSUSED */
1004 static void
1005 arc_buf_unwatch(arc_buf_t *buf)
1006 {
1007 #ifndef _KERNEL
1008         if (arc_watch) {
1009                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
1010                     PROT_READ | PROT_WRITE));
1011         }
1012 #endif
1013 }
1014
1015 /* ARGSUSED */
1016 static void
1017 arc_buf_watch(arc_buf_t *buf)
1018 {
1019 #ifndef _KERNEL
1020         if (arc_watch)
1021                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
1022 #endif
1023 }
1024
1025 void
1026 arc_buf_thaw(arc_buf_t *buf)
1027 {
1028         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1029                 if (buf->b_hdr->b_state != arc_anon)
1030                         panic("modifying non-anon buffer!");
1031                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1032                         panic("modifying buffer while i/o in progress!");
1033                 arc_cksum_verify(buf);
1034         }
1035
1036         mutex_enter(&buf->b_hdr->b_freeze_lock);
1037         if (buf->b_hdr->b_freeze_cksum != NULL) {
1038                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1039                 buf->b_hdr->b_freeze_cksum = NULL;
1040         }
1041
1042         mutex_exit(&buf->b_hdr->b_freeze_lock);
1043
1044         arc_buf_unwatch(buf);
1045 }
1046
1047 void
1048 arc_buf_freeze(arc_buf_t *buf)
1049 {
1050         kmutex_t *hash_lock;
1051
1052         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1053                 return;
1054
1055         hash_lock = HDR_LOCK(buf->b_hdr);
1056         mutex_enter(hash_lock);
1057
1058         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1059             buf->b_hdr->b_state == arc_anon);
1060         arc_cksum_compute(buf, B_FALSE);
1061         mutex_exit(hash_lock);
1062
1063 }
1064
1065 static void
1066 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1067 {
1068         ASSERT(MUTEX_HELD(hash_lock));
1069
1070         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1071             (ab->b_state != arc_anon)) {
1072                 uint64_t delta = ab->b_size * ab->b_datacnt;
1073                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1074                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1075
1076                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1077                 mutex_enter(&ab->b_state->arcs_mtx);
1078                 ASSERT(list_link_active(&ab->b_arc_node));
1079                 list_remove(list, ab);
1080                 if (GHOST_STATE(ab->b_state)) {
1081                         ASSERT0(ab->b_datacnt);
1082                         ASSERT3P(ab->b_buf, ==, NULL);
1083                         delta = ab->b_size;
1084                 }
1085                 ASSERT(delta > 0);
1086                 ASSERT3U(*size, >=, delta);
1087                 atomic_add_64(size, -delta);
1088                 mutex_exit(&ab->b_state->arcs_mtx);
1089                 /* remove the prefetch flag if we get a reference */
1090                 if (ab->b_flags & ARC_PREFETCH)
1091                         ab->b_flags &= ~ARC_PREFETCH;
1092         }
1093 }
1094
1095 static int
1096 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1097 {
1098         int cnt;
1099         arc_state_t *state = ab->b_state;
1100
1101         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1102         ASSERT(!GHOST_STATE(state));
1103
1104         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1105             (state != arc_anon)) {
1106                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1107
1108                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1109                 mutex_enter(&state->arcs_mtx);
1110                 ASSERT(!list_link_active(&ab->b_arc_node));
1111                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1112                 ASSERT(ab->b_datacnt > 0);
1113                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1114                 mutex_exit(&state->arcs_mtx);
1115         }
1116         return (cnt);
1117 }
1118
1119 /*
1120  * Returns detailed information about a specific arc buffer.  When the
1121  * state_index argument is set the function will calculate the arc header
1122  * list position for its arc state.  Since this requires a linear traversal
1123  * callers are strongly encourage not to do this.  However, it can be helpful
1124  * for targeted analysis so the functionality is provided.
1125  */
1126 void
1127 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
1128 {
1129         arc_buf_hdr_t *hdr = ab->b_hdr;
1130         arc_state_t *state = hdr->b_state;
1131
1132         memset(abi, 0, sizeof (arc_buf_info_t));
1133         abi->abi_flags = hdr->b_flags;
1134         abi->abi_datacnt = hdr->b_datacnt;
1135         abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
1136         abi->abi_state_contents = hdr->b_type;
1137         abi->abi_state_index = -1;
1138         abi->abi_size = hdr->b_size;
1139         abi->abi_access = hdr->b_arc_access;
1140         abi->abi_mru_hits = hdr->b_mru_hits;
1141         abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits;
1142         abi->abi_mfu_hits = hdr->b_mfu_hits;
1143         abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
1144         abi->abi_holds = refcount_count(&hdr->b_refcnt);
1145
1146         if (hdr->b_l2hdr) {
1147                 abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr;
1148                 abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize;
1149                 abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress;
1150                 abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits;
1151         }
1152
1153         if (state && state_index && list_link_active(&hdr->b_arc_node)) {
1154                 list_t *list = &state->arcs_list[hdr->b_type];
1155                 arc_buf_hdr_t *h;
1156
1157                 mutex_enter(&state->arcs_mtx);
1158                 for (h = list_head(list); h != NULL; h = list_next(list, h)) {
1159                         abi->abi_state_index++;
1160                         if (h == hdr)
1161                                 break;
1162                 }
1163                 mutex_exit(&state->arcs_mtx);
1164         }
1165 }
1166
1167 /*
1168  * Move the supplied buffer to the indicated state.  The mutex
1169  * for the buffer must be held by the caller.
1170  */
1171 static void
1172 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1173 {
1174         arc_state_t *old_state = ab->b_state;
1175         int64_t refcnt = refcount_count(&ab->b_refcnt);
1176         uint64_t from_delta, to_delta;
1177
1178         ASSERT(MUTEX_HELD(hash_lock));
1179         ASSERT3P(new_state, !=, old_state);
1180         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1181         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1182         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1183
1184         from_delta = to_delta = ab->b_datacnt * ab->b_size;
1185
1186         /*
1187          * If this buffer is evictable, transfer it from the
1188          * old state list to the new state list.
1189          */
1190         if (refcnt == 0) {
1191                 if (old_state != arc_anon) {
1192                         int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1193                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1194
1195                         if (use_mutex)
1196                                 mutex_enter(&old_state->arcs_mtx);
1197
1198                         ASSERT(list_link_active(&ab->b_arc_node));
1199                         list_remove(&old_state->arcs_list[ab->b_type], ab);
1200
1201                         /*
1202                          * If prefetching out of the ghost cache,
1203                          * we will have a non-zero datacnt.
1204                          */
1205                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1206                                 /* ghost elements have a ghost size */
1207                                 ASSERT(ab->b_buf == NULL);
1208                                 from_delta = ab->b_size;
1209                         }
1210                         ASSERT3U(*size, >=, from_delta);
1211                         atomic_add_64(size, -from_delta);
1212
1213                         if (use_mutex)
1214                                 mutex_exit(&old_state->arcs_mtx);
1215                 }
1216                 if (new_state != arc_anon) {
1217                         int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1218                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1219
1220                         if (use_mutex)
1221                                 mutex_enter(&new_state->arcs_mtx);
1222
1223                         list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1224
1225                         /* ghost elements have a ghost size */
1226                         if (GHOST_STATE(new_state)) {
1227                                 ASSERT(ab->b_datacnt == 0);
1228                                 ASSERT(ab->b_buf == NULL);
1229                                 to_delta = ab->b_size;
1230                         }
1231                         atomic_add_64(size, to_delta);
1232
1233                         if (use_mutex)
1234                                 mutex_exit(&new_state->arcs_mtx);
1235                 }
1236         }
1237
1238         ASSERT(!BUF_EMPTY(ab));
1239         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1240                 buf_hash_remove(ab);
1241
1242         /* adjust state sizes */
1243         if (to_delta)
1244                 atomic_add_64(&new_state->arcs_size, to_delta);
1245         if (from_delta) {
1246                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1247                 atomic_add_64(&old_state->arcs_size, -from_delta);
1248         }
1249         ab->b_state = new_state;
1250
1251         /* adjust l2arc hdr stats */
1252         if (new_state == arc_l2c_only)
1253                 l2arc_hdr_stat_add();
1254         else if (old_state == arc_l2c_only)
1255                 l2arc_hdr_stat_remove();
1256 }
1257
1258 void
1259 arc_space_consume(uint64_t space, arc_space_type_t type)
1260 {
1261         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1262
1263         switch (type) {
1264         default:
1265                 break;
1266         case ARC_SPACE_DATA:
1267                 ARCSTAT_INCR(arcstat_data_size, space);
1268                 break;
1269         case ARC_SPACE_META:
1270                 ARCSTAT_INCR(arcstat_meta_size, space);
1271                 break;
1272         case ARC_SPACE_OTHER:
1273                 ARCSTAT_INCR(arcstat_other_size, space);
1274                 break;
1275         case ARC_SPACE_HDRS:
1276                 ARCSTAT_INCR(arcstat_hdr_size, space);
1277                 break;
1278         case ARC_SPACE_L2HDRS:
1279                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1280                 break;
1281         }
1282
1283         if (type != ARC_SPACE_DATA) {
1284                 ARCSTAT_INCR(arcstat_meta_used, space);
1285                 if (arc_meta_max < arc_meta_used)
1286                         arc_meta_max = arc_meta_used;
1287         }
1288
1289         atomic_add_64(&arc_size, space);
1290 }
1291
1292 void
1293 arc_space_return(uint64_t space, arc_space_type_t type)
1294 {
1295         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1296
1297         switch (type) {
1298         default:
1299                 break;
1300         case ARC_SPACE_DATA:
1301                 ARCSTAT_INCR(arcstat_data_size, -space);
1302                 break;
1303         case ARC_SPACE_META:
1304                 ARCSTAT_INCR(arcstat_meta_size, -space);
1305                 break;
1306         case ARC_SPACE_OTHER:
1307                 ARCSTAT_INCR(arcstat_other_size, -space);
1308                 break;
1309         case ARC_SPACE_HDRS:
1310                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1311                 break;
1312         case ARC_SPACE_L2HDRS:
1313                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1314                 break;
1315         }
1316
1317         if (type != ARC_SPACE_DATA) {
1318                 ASSERT(arc_meta_used >= space);
1319                 ARCSTAT_INCR(arcstat_meta_used, -space);
1320         }
1321
1322         ASSERT(arc_size >= space);
1323         atomic_add_64(&arc_size, -space);
1324 }
1325
1326 arc_buf_t *
1327 arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
1328 {
1329         arc_buf_hdr_t *hdr;
1330         arc_buf_t *buf;
1331
1332         VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
1333         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1334         ASSERT(BUF_EMPTY(hdr));
1335         hdr->b_size = size;
1336         hdr->b_type = type;
1337         hdr->b_spa = spa_load_guid(spa);
1338         hdr->b_state = arc_anon;
1339         hdr->b_arc_access = 0;
1340         hdr->b_mru_hits = 0;
1341         hdr->b_mru_ghost_hits = 0;
1342         hdr->b_mfu_hits = 0;
1343         hdr->b_mfu_ghost_hits = 0;
1344         hdr->b_l2_hits = 0;
1345         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1346         buf->b_hdr = hdr;
1347         buf->b_data = NULL;
1348         buf->b_efunc = NULL;
1349         buf->b_private = NULL;
1350         buf->b_next = NULL;
1351         hdr->b_buf = buf;
1352         arc_get_data_buf(buf);
1353         hdr->b_datacnt = 1;
1354         hdr->b_flags = 0;
1355         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1356         (void) refcount_add(&hdr->b_refcnt, tag);
1357
1358         return (buf);
1359 }
1360
1361 static char *arc_onloan_tag = "onloan";
1362
1363 /*
1364  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1365  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1366  * buffers must be returned to the arc before they can be used by the DMU or
1367  * freed.
1368  */
1369 arc_buf_t *
1370 arc_loan_buf(spa_t *spa, uint64_t size)
1371 {
1372         arc_buf_t *buf;
1373
1374         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1375
1376         atomic_add_64(&arc_loaned_bytes, size);
1377         return (buf);
1378 }
1379
1380 /*
1381  * Return a loaned arc buffer to the arc.
1382  */
1383 void
1384 arc_return_buf(arc_buf_t *buf, void *tag)
1385 {
1386         arc_buf_hdr_t *hdr = buf->b_hdr;
1387
1388         ASSERT(buf->b_data != NULL);
1389         (void) refcount_add(&hdr->b_refcnt, tag);
1390         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1391
1392         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1393 }
1394
1395 /* Detach an arc_buf from a dbuf (tag) */
1396 void
1397 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1398 {
1399         arc_buf_hdr_t *hdr;
1400
1401         ASSERT(buf->b_data != NULL);
1402         hdr = buf->b_hdr;
1403         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1404         (void) refcount_remove(&hdr->b_refcnt, tag);
1405         buf->b_efunc = NULL;
1406         buf->b_private = NULL;
1407
1408         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1409 }
1410
1411 static arc_buf_t *
1412 arc_buf_clone(arc_buf_t *from)
1413 {
1414         arc_buf_t *buf;
1415         arc_buf_hdr_t *hdr = from->b_hdr;
1416         uint64_t size = hdr->b_size;
1417
1418         ASSERT(hdr->b_state != arc_anon);
1419
1420         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1421         buf->b_hdr = hdr;
1422         buf->b_data = NULL;
1423         buf->b_efunc = NULL;
1424         buf->b_private = NULL;
1425         buf->b_next = hdr->b_buf;
1426         hdr->b_buf = buf;
1427         arc_get_data_buf(buf);
1428         bcopy(from->b_data, buf->b_data, size);
1429
1430         /*
1431          * This buffer already exists in the arc so create a duplicate
1432          * copy for the caller.  If the buffer is associated with user data
1433          * then track the size and number of duplicates.  These stats will be
1434          * updated as duplicate buffers are created and destroyed.
1435          */
1436         if (hdr->b_type == ARC_BUFC_DATA) {
1437                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1438                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1439         }
1440         hdr->b_datacnt += 1;
1441         return (buf);
1442 }
1443
1444 void
1445 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1446 {
1447         arc_buf_hdr_t *hdr;
1448         kmutex_t *hash_lock;
1449
1450         /*
1451          * Check to see if this buffer is evicted.  Callers
1452          * must verify b_data != NULL to know if the add_ref
1453          * was successful.
1454          */
1455         mutex_enter(&buf->b_evict_lock);
1456         if (buf->b_data == NULL) {
1457                 mutex_exit(&buf->b_evict_lock);
1458                 return;
1459         }
1460         hash_lock = HDR_LOCK(buf->b_hdr);
1461         mutex_enter(hash_lock);
1462         hdr = buf->b_hdr;
1463         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1464         mutex_exit(&buf->b_evict_lock);
1465
1466         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1467         add_reference(hdr, hash_lock, tag);
1468         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1469         arc_access(hdr, hash_lock);
1470         mutex_exit(hash_lock);
1471         ARCSTAT_BUMP(arcstat_hits);
1472         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1473             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1474             data, metadata, hits);
1475 }
1476
1477 static void
1478 arc_buf_free_on_write(void *data, size_t size,
1479     void (*free_func)(void *, size_t))
1480 {
1481         l2arc_data_free_t *df;
1482
1483         df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1484         df->l2df_data = data;
1485         df->l2df_size = size;
1486         df->l2df_func = free_func;
1487         mutex_enter(&l2arc_free_on_write_mtx);
1488         list_insert_head(l2arc_free_on_write, df);
1489         mutex_exit(&l2arc_free_on_write_mtx);
1490 }
1491
1492 /*
1493  * Free the arc data buffer.  If it is an l2arc write in progress,
1494  * the buffer is placed on l2arc_free_on_write to be freed later.
1495  */
1496 static void
1497 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1498 {
1499         arc_buf_hdr_t *hdr = buf->b_hdr;
1500
1501         if (HDR_L2_WRITING(hdr)) {
1502                 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1503                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1504         } else {
1505                 free_func(buf->b_data, hdr->b_size);
1506         }
1507 }
1508
1509 /*
1510  * Free up buf->b_data and if 'remove' is set, then pull the
1511  * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
1512  */
1513 static void
1514 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1515 {
1516         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1517
1518         ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
1519
1520         if (l2hdr->b_tmp_cdata == NULL)
1521                 return;
1522
1523         ASSERT(HDR_L2_WRITING(hdr));
1524         arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
1525             zio_data_buf_free);
1526         ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1527         l2hdr->b_tmp_cdata = NULL;
1528 }
1529
1530 static void
1531 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1532 {
1533         arc_buf_t **bufp;
1534
1535         /* free up data associated with the buf */
1536         if (buf->b_data) {
1537                 arc_state_t *state = buf->b_hdr->b_state;
1538                 uint64_t size = buf->b_hdr->b_size;
1539                 arc_buf_contents_t type = buf->b_hdr->b_type;
1540
1541                 arc_cksum_verify(buf);
1542                 arc_buf_unwatch(buf);
1543
1544                 if (!recycle) {
1545                         if (type == ARC_BUFC_METADATA) {
1546                                 arc_buf_data_free(buf, zio_buf_free);
1547                                 arc_space_return(size, ARC_SPACE_META);
1548                         } else {
1549                                 ASSERT(type == ARC_BUFC_DATA);
1550                                 arc_buf_data_free(buf, zio_data_buf_free);
1551                                 arc_space_return(size, ARC_SPACE_DATA);
1552                         }
1553                 }
1554                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1555                         uint64_t *cnt = &state->arcs_lsize[type];
1556
1557                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1558                         ASSERT(state != arc_anon);
1559
1560                         ASSERT3U(*cnt, >=, size);
1561                         atomic_add_64(cnt, -size);
1562                 }
1563                 ASSERT3U(state->arcs_size, >=, size);
1564                 atomic_add_64(&state->arcs_size, -size);
1565                 buf->b_data = NULL;
1566
1567                 /*
1568                  * If we're destroying a duplicate buffer make sure
1569                  * that the appropriate statistics are updated.
1570                  */
1571                 if (buf->b_hdr->b_datacnt > 1 &&
1572                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
1573                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1574                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1575                 }
1576                 ASSERT(buf->b_hdr->b_datacnt > 0);
1577                 buf->b_hdr->b_datacnt -= 1;
1578         }
1579
1580         /* only remove the buf if requested */
1581         if (!remove)
1582                 return;
1583
1584         /* remove the buf from the hdr list */
1585         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1586                 continue;
1587         *bufp = buf->b_next;
1588         buf->b_next = NULL;
1589
1590         ASSERT(buf->b_efunc == NULL);
1591
1592         /* clean up the buf */
1593         buf->b_hdr = NULL;
1594         kmem_cache_free(buf_cache, buf);
1595 }
1596
1597 static void
1598 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1599 {
1600         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1601
1602         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1603         ASSERT3P(hdr->b_state, ==, arc_anon);
1604         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1605
1606         if (l2hdr != NULL) {
1607                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1608                 /*
1609                  * To prevent arc_free() and l2arc_evict() from
1610                  * attempting to free the same buffer at the same time,
1611                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1612                  * give it priority.  l2arc_evict() can't destroy this
1613                  * header while we are waiting on l2arc_buflist_mtx.
1614                  *
1615                  * The hdr may be removed from l2ad_buflist before we
1616                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1617                  */
1618                 if (!buflist_held) {
1619                         mutex_enter(&l2arc_buflist_mtx);
1620                         l2hdr = hdr->b_l2hdr;
1621                 }
1622
1623                 if (l2hdr != NULL) {
1624                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1625                         arc_buf_l2_cdata_free(hdr);
1626                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1627                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1628                         vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1629                             -l2hdr->b_asize, 0, 0);
1630                         kmem_cache_free(l2arc_hdr_cache, l2hdr);
1631                         arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
1632                         if (hdr->b_state == arc_l2c_only)
1633                                 l2arc_hdr_stat_remove();
1634                         hdr->b_l2hdr = NULL;
1635                 }
1636
1637                 if (!buflist_held)
1638                         mutex_exit(&l2arc_buflist_mtx);
1639         }
1640
1641         if (!BUF_EMPTY(hdr)) {
1642                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1643                 buf_discard_identity(hdr);
1644         }
1645         while (hdr->b_buf) {
1646                 arc_buf_t *buf = hdr->b_buf;
1647
1648                 if (buf->b_efunc) {
1649                         mutex_enter(&arc_eviction_mtx);
1650                         mutex_enter(&buf->b_evict_lock);
1651                         ASSERT(buf->b_hdr != NULL);
1652                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1653                         hdr->b_buf = buf->b_next;
1654                         buf->b_hdr = &arc_eviction_hdr;
1655                         buf->b_next = arc_eviction_list;
1656                         arc_eviction_list = buf;
1657                         mutex_exit(&buf->b_evict_lock);
1658                         mutex_exit(&arc_eviction_mtx);
1659                 } else {
1660                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1661                 }
1662         }
1663         if (hdr->b_freeze_cksum != NULL) {
1664                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1665                 hdr->b_freeze_cksum = NULL;
1666         }
1667
1668         ASSERT(!list_link_active(&hdr->b_arc_node));
1669         ASSERT3P(hdr->b_hash_next, ==, NULL);
1670         ASSERT3P(hdr->b_acb, ==, NULL);
1671         kmem_cache_free(hdr_cache, hdr);
1672 }
1673
1674 void
1675 arc_buf_free(arc_buf_t *buf, void *tag)
1676 {
1677         arc_buf_hdr_t *hdr = buf->b_hdr;
1678         int hashed = hdr->b_state != arc_anon;
1679
1680         ASSERT(buf->b_efunc == NULL);
1681         ASSERT(buf->b_data != NULL);
1682
1683         if (hashed) {
1684                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1685
1686                 mutex_enter(hash_lock);
1687                 hdr = buf->b_hdr;
1688                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1689
1690                 (void) remove_reference(hdr, hash_lock, tag);
1691                 if (hdr->b_datacnt > 1) {
1692                         arc_buf_destroy(buf, FALSE, TRUE);
1693                 } else {
1694                         ASSERT(buf == hdr->b_buf);
1695                         ASSERT(buf->b_efunc == NULL);
1696                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1697                 }
1698                 mutex_exit(hash_lock);
1699         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1700                 int destroy_hdr;
1701                 /*
1702                  * We are in the middle of an async write.  Don't destroy
1703                  * this buffer unless the write completes before we finish
1704                  * decrementing the reference count.
1705                  */
1706                 mutex_enter(&arc_eviction_mtx);
1707                 (void) remove_reference(hdr, NULL, tag);
1708                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1709                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1710                 mutex_exit(&arc_eviction_mtx);
1711                 if (destroy_hdr)
1712                         arc_hdr_destroy(hdr);
1713         } else {
1714                 if (remove_reference(hdr, NULL, tag) > 0)
1715                         arc_buf_destroy(buf, FALSE, TRUE);
1716                 else
1717                         arc_hdr_destroy(hdr);
1718         }
1719 }
1720
1721 boolean_t
1722 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1723 {
1724         arc_buf_hdr_t *hdr = buf->b_hdr;
1725         kmutex_t *hash_lock = NULL;
1726         boolean_t no_callback = (buf->b_efunc == NULL);
1727
1728         if (hdr->b_state == arc_anon) {
1729                 ASSERT(hdr->b_datacnt == 1);
1730                 arc_buf_free(buf, tag);
1731                 return (no_callback);
1732         }
1733
1734         hash_lock = HDR_LOCK(hdr);
1735         mutex_enter(hash_lock);
1736         hdr = buf->b_hdr;
1737         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1738         ASSERT(hdr->b_state != arc_anon);
1739         ASSERT(buf->b_data != NULL);
1740
1741         (void) remove_reference(hdr, hash_lock, tag);
1742         if (hdr->b_datacnt > 1) {
1743                 if (no_callback)
1744                         arc_buf_destroy(buf, FALSE, TRUE);
1745         } else if (no_callback) {
1746                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1747                 ASSERT(buf->b_efunc == NULL);
1748                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1749         }
1750         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1751             refcount_is_zero(&hdr->b_refcnt));
1752         mutex_exit(hash_lock);
1753         return (no_callback);
1754 }
1755
1756 uint64_t
1757 arc_buf_size(arc_buf_t *buf)
1758 {
1759         return (buf->b_hdr->b_size);
1760 }
1761
1762 /*
1763  * Called from the DMU to determine if the current buffer should be
1764  * evicted. In order to ensure proper locking, the eviction must be initiated
1765  * from the DMU. Return true if the buffer is associated with user data and
1766  * duplicate buffers still exist.
1767  */
1768 boolean_t
1769 arc_buf_eviction_needed(arc_buf_t *buf)
1770 {
1771         arc_buf_hdr_t *hdr;
1772         boolean_t evict_needed = B_FALSE;
1773
1774         if (zfs_disable_dup_eviction)
1775                 return (B_FALSE);
1776
1777         mutex_enter(&buf->b_evict_lock);
1778         hdr = buf->b_hdr;
1779         if (hdr == NULL) {
1780                 /*
1781                  * We are in arc_do_user_evicts(); let that function
1782                  * perform the eviction.
1783                  */
1784                 ASSERT(buf->b_data == NULL);
1785                 mutex_exit(&buf->b_evict_lock);
1786                 return (B_FALSE);
1787         } else if (buf->b_data == NULL) {
1788                 /*
1789                  * We have already been added to the arc eviction list;
1790                  * recommend eviction.
1791                  */
1792                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1793                 mutex_exit(&buf->b_evict_lock);
1794                 return (B_TRUE);
1795         }
1796
1797         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1798                 evict_needed = B_TRUE;
1799
1800         mutex_exit(&buf->b_evict_lock);
1801         return (evict_needed);
1802 }
1803
1804 /*
1805  * Evict buffers from list until we've removed the specified number of
1806  * bytes.  Move the removed buffers to the appropriate evict state.
1807  * If the recycle flag is set, then attempt to "recycle" a buffer:
1808  * - look for a buffer to evict that is `bytes' long.
1809  * - return the data block from this buffer rather than freeing it.
1810  * This flag is used by callers that are trying to make space for a
1811  * new buffer in a full arc cache.
1812  *
1813  * This function makes a "best effort".  It skips over any buffers
1814  * it can't get a hash_lock on, and so may not catch all candidates.
1815  * It may also return without evicting as much space as requested.
1816  */
1817 static void *
1818 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1819     arc_buf_contents_t type)
1820 {
1821         arc_state_t *evicted_state;
1822         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1823         arc_buf_hdr_t *ab, *ab_prev = NULL;
1824         list_t *list = &state->arcs_list[type];
1825         kmutex_t *hash_lock;
1826         boolean_t have_lock;
1827         void *stolen = NULL;
1828         arc_buf_hdr_t *marker;
1829         int count = 0;
1830
1831         ASSERT(state == arc_mru || state == arc_mfu);
1832
1833         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1834
1835         marker = kmem_zalloc(sizeof (arc_buf_hdr_t), KM_SLEEP);
1836
1837 top:
1838         mutex_enter(&state->arcs_mtx);
1839         mutex_enter(&evicted_state->arcs_mtx);
1840
1841         for (ab = list_tail(list); ab; ab = ab_prev) {
1842                 ab_prev = list_prev(list, ab);
1843                 /* prefetch buffers have a minimum lifespan */
1844                 if (HDR_IO_IN_PROGRESS(ab) ||
1845                     (spa && ab->b_spa != spa) ||
1846                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1847                     ddi_get_lbolt() - ab->b_arc_access <
1848                     zfs_arc_min_prefetch_lifespan)) {
1849                         skipped++;
1850                         continue;
1851                 }
1852                 /* "lookahead" for better eviction candidate */
1853                 if (recycle && ab->b_size != bytes &&
1854                     ab_prev && ab_prev->b_size == bytes)
1855                         continue;
1856
1857                 /* ignore markers */
1858                 if (ab->b_spa == 0)
1859                         continue;
1860
1861                 /*
1862                  * It may take a long time to evict all the bufs requested.
1863                  * To avoid blocking all arc activity, periodically drop
1864                  * the arcs_mtx and give other threads a chance to run
1865                  * before reacquiring the lock.
1866                  *
1867                  * If we are looking for a buffer to recycle, we are in
1868                  * the hot code path, so don't sleep.
1869                  */
1870                 if (!recycle && count++ > arc_evict_iterations) {
1871                         list_insert_after(list, ab, marker);
1872                         mutex_exit(&evicted_state->arcs_mtx);
1873                         mutex_exit(&state->arcs_mtx);
1874                         kpreempt(KPREEMPT_SYNC);
1875                         mutex_enter(&state->arcs_mtx);
1876                         mutex_enter(&evicted_state->arcs_mtx);
1877                         ab_prev = list_prev(list, marker);
1878                         list_remove(list, marker);
1879                         count = 0;
1880                         continue;
1881                 }
1882
1883                 hash_lock = HDR_LOCK(ab);
1884                 have_lock = MUTEX_HELD(hash_lock);
1885                 if (have_lock || mutex_tryenter(hash_lock)) {
1886                         ASSERT0(refcount_count(&ab->b_refcnt));
1887                         ASSERT(ab->b_datacnt > 0);
1888                         while (ab->b_buf) {
1889                                 arc_buf_t *buf = ab->b_buf;
1890                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1891                                         missed += 1;
1892                                         break;
1893                                 }
1894                                 if (buf->b_data) {
1895                                         bytes_evicted += ab->b_size;
1896                                         if (recycle && ab->b_type == type &&
1897                                             ab->b_size == bytes &&
1898                                             !HDR_L2_WRITING(ab)) {
1899                                                 stolen = buf->b_data;
1900                                                 recycle = FALSE;
1901                                         }
1902                                 }
1903                                 if (buf->b_efunc) {
1904                                         mutex_enter(&arc_eviction_mtx);
1905                                         arc_buf_destroy(buf,
1906                                             buf->b_data == stolen, FALSE);
1907                                         ab->b_buf = buf->b_next;
1908                                         buf->b_hdr = &arc_eviction_hdr;
1909                                         buf->b_next = arc_eviction_list;
1910                                         arc_eviction_list = buf;
1911                                         mutex_exit(&arc_eviction_mtx);
1912                                         mutex_exit(&buf->b_evict_lock);
1913                                 } else {
1914                                         mutex_exit(&buf->b_evict_lock);
1915                                         arc_buf_destroy(buf,
1916                                             buf->b_data == stolen, TRUE);
1917                                 }
1918                         }
1919
1920                         if (ab->b_l2hdr) {
1921                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1922                                     ab->b_size);
1923                         } else {
1924                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1925                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1926                                             ab->b_size);
1927                                 } else {
1928                                         ARCSTAT_INCR(
1929                                             arcstat_evict_l2_ineligible,
1930                                             ab->b_size);
1931                                 }
1932                         }
1933
1934                         if (ab->b_datacnt == 0) {
1935                                 arc_change_state(evicted_state, ab, hash_lock);
1936                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1937                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1938                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1939                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1940                         }
1941                         if (!have_lock)
1942                                 mutex_exit(hash_lock);
1943                         if (bytes >= 0 && bytes_evicted >= bytes)
1944                                 break;
1945                 } else {
1946                         missed += 1;
1947                 }
1948         }
1949
1950         mutex_exit(&evicted_state->arcs_mtx);
1951         mutex_exit(&state->arcs_mtx);
1952
1953         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1954             (bytes < 0 || bytes_evicted < bytes)) {
1955                 /* Prevent second pass from recycling metadata into data */
1956                 recycle = FALSE;
1957                 type = ARC_BUFC_METADATA;
1958                 list = &state->arcs_list[type];
1959                 goto top;
1960         }
1961
1962         kmem_free(marker, sizeof (arc_buf_hdr_t));
1963
1964         if (bytes_evicted < bytes)
1965                 dprintf("only evicted %lld bytes from %x\n",
1966                     (longlong_t)bytes_evicted, state->arcs_state);
1967
1968         if (skipped)
1969                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1970
1971         if (missed)
1972                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1973
1974         /*
1975          * Note: we have just evicted some data into the ghost state,
1976          * potentially putting the ghost size over the desired size.  Rather
1977          * that evicting from the ghost list in this hot code path, leave
1978          * this chore to the arc_reclaim_thread().
1979          */
1980
1981         return (stolen);
1982 }
1983
1984 /*
1985  * Remove buffers from list until we've removed the specified number of
1986  * bytes.  Destroy the buffers that are removed.
1987  */
1988 static void
1989 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
1990     arc_buf_contents_t type)
1991 {
1992         arc_buf_hdr_t *ab, *ab_prev;
1993         arc_buf_hdr_t *marker;
1994         list_t *list = &state->arcs_list[type];
1995         kmutex_t *hash_lock;
1996         uint64_t bytes_deleted = 0;
1997         uint64_t bufs_skipped = 0;
1998         int count = 0;
1999
2000         ASSERT(GHOST_STATE(state));
2001
2002         marker = kmem_zalloc(sizeof (arc_buf_hdr_t), KM_SLEEP);
2003
2004 top:
2005         mutex_enter(&state->arcs_mtx);
2006         for (ab = list_tail(list); ab; ab = ab_prev) {
2007                 ab_prev = list_prev(list, ab);
2008                 if (ab->b_type > ARC_BUFC_NUMTYPES)
2009                         panic("invalid ab=%p", (void *)ab);
2010                 if (spa && ab->b_spa != spa)
2011                         continue;
2012
2013                 /* ignore markers */
2014                 if (ab->b_spa == 0)
2015                         continue;
2016
2017                 hash_lock = HDR_LOCK(ab);
2018                 /* caller may be trying to modify this buffer, skip it */
2019                 if (MUTEX_HELD(hash_lock))
2020                         continue;
2021
2022                 /*
2023                  * It may take a long time to evict all the bufs requested.
2024                  * To avoid blocking all arc activity, periodically drop
2025                  * the arcs_mtx and give other threads a chance to run
2026                  * before reacquiring the lock.
2027                  */
2028                 if (count++ > arc_evict_iterations) {
2029                         list_insert_after(list, ab, marker);
2030                         mutex_exit(&state->arcs_mtx);
2031                         kpreempt(KPREEMPT_SYNC);
2032                         mutex_enter(&state->arcs_mtx);
2033                         ab_prev = list_prev(list, marker);
2034                         list_remove(list, marker);
2035                         count = 0;
2036                         continue;
2037                 }
2038                 if (mutex_tryenter(hash_lock)) {
2039                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
2040                         ASSERT(ab->b_buf == NULL);
2041                         ARCSTAT_BUMP(arcstat_deleted);
2042                         bytes_deleted += ab->b_size;
2043
2044                         if (ab->b_l2hdr != NULL) {
2045                                 /*
2046                                  * This buffer is cached on the 2nd Level ARC;
2047                                  * don't destroy the header.
2048                                  */
2049                                 arc_change_state(arc_l2c_only, ab, hash_lock);
2050                                 mutex_exit(hash_lock);
2051                         } else {
2052                                 arc_change_state(arc_anon, ab, hash_lock);
2053                                 mutex_exit(hash_lock);
2054                                 arc_hdr_destroy(ab);
2055                         }
2056
2057                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2058                         if (bytes >= 0 && bytes_deleted >= bytes)
2059                                 break;
2060                 } else if (bytes < 0) {
2061                         /*
2062                          * Insert a list marker and then wait for the
2063                          * hash lock to become available. Once its
2064                          * available, restart from where we left off.
2065                          */
2066                         list_insert_after(list, ab, marker);
2067                         mutex_exit(&state->arcs_mtx);
2068                         mutex_enter(hash_lock);
2069                         mutex_exit(hash_lock);
2070                         mutex_enter(&state->arcs_mtx);
2071                         ab_prev = list_prev(list, marker);
2072                         list_remove(list, marker);
2073                 } else {
2074                         bufs_skipped += 1;
2075                 }
2076         }
2077         mutex_exit(&state->arcs_mtx);
2078
2079         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2080             (bytes < 0 || bytes_deleted < bytes)) {
2081                 list = &state->arcs_list[ARC_BUFC_METADATA];
2082                 goto top;
2083         }
2084
2085         kmem_free(marker, sizeof (arc_buf_hdr_t));
2086
2087         if (bufs_skipped) {
2088                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2089                 ASSERT(bytes >= 0);
2090         }
2091
2092         if (bytes_deleted < bytes)
2093                 dprintf("only deleted %lld bytes from %p\n",
2094                     (longlong_t)bytes_deleted, state);
2095 }
2096
2097 static void
2098 arc_adjust(void)
2099 {
2100         int64_t adjustment, delta;
2101
2102         /*
2103          * Adjust MRU size
2104          */
2105
2106         adjustment = MIN((int64_t)(arc_size - arc_c),
2107             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
2108
2109         if (adjustment > 0 && arc_mru->arcs_size > 0) {
2110                 delta = MIN(arc_mru->arcs_size, adjustment);
2111                 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2112         }
2113
2114         /*
2115          * Adjust MFU size
2116          */
2117
2118         adjustment = arc_size - arc_c;
2119
2120         if (adjustment > 0 && arc_mfu->arcs_size > 0) {
2121                 delta = MIN(arc_mfu->arcs_size, adjustment);
2122                 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2123         }
2124
2125         /*
2126          * Adjust ghost lists
2127          */
2128
2129         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2130
2131         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2132                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2133                 arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA);
2134         }
2135
2136         adjustment =
2137             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2138
2139         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2140                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2141                 arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA);
2142         }
2143 }
2144
2145 /*
2146  * Request that arc user drop references so that N bytes can be released
2147  * from the cache.  This provides a mechanism to ensure the arc can honor
2148  * the arc_meta_limit and reclaim buffers which are pinned in the cache
2149  * by higher layers.  (i.e. the zpl)
2150  */
2151 static void
2152 arc_do_user_prune(int64_t adjustment)
2153 {
2154         arc_prune_func_t *func;
2155         void *private;
2156         arc_prune_t *cp, *np;
2157
2158         mutex_enter(&arc_prune_mtx);
2159
2160         cp = list_head(&arc_prune_list);
2161         while (cp != NULL) {
2162                 func = cp->p_pfunc;
2163                 private = cp->p_private;
2164                 np = list_next(&arc_prune_list, cp);
2165                 refcount_add(&cp->p_refcnt, func);
2166                 mutex_exit(&arc_prune_mtx);
2167
2168                 if (func != NULL)
2169                         func(adjustment, private);
2170
2171                 mutex_enter(&arc_prune_mtx);
2172
2173                 /* User removed prune callback concurrently with execution */
2174                 if (refcount_remove(&cp->p_refcnt, func) == 0) {
2175                         ASSERT(!list_link_active(&cp->p_node));
2176                         refcount_destroy(&cp->p_refcnt);
2177                         kmem_free(cp, sizeof (*cp));
2178                 }
2179
2180                 cp = np;
2181         }
2182
2183         ARCSTAT_BUMP(arcstat_prune);
2184         mutex_exit(&arc_prune_mtx);
2185 }
2186
2187 static void
2188 arc_do_user_evicts(void)
2189 {
2190         mutex_enter(&arc_eviction_mtx);
2191         while (arc_eviction_list != NULL) {
2192                 arc_buf_t *buf = arc_eviction_list;
2193                 arc_eviction_list = buf->b_next;
2194                 mutex_enter(&buf->b_evict_lock);
2195                 buf->b_hdr = NULL;
2196                 mutex_exit(&buf->b_evict_lock);
2197                 mutex_exit(&arc_eviction_mtx);
2198
2199                 if (buf->b_efunc != NULL)
2200                         VERIFY0(buf->b_efunc(buf->b_private));
2201
2202                 buf->b_efunc = NULL;
2203                 buf->b_private = NULL;
2204                 kmem_cache_free(buf_cache, buf);
2205                 mutex_enter(&arc_eviction_mtx);
2206         }
2207         mutex_exit(&arc_eviction_mtx);
2208 }
2209
2210 /*
2211  * The goal of this function is to evict enough meta data buffers from the
2212  * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
2213  * more complicated than it appears because it is common for data buffers
2214  * to have holds on meta data buffers.  In addition, dnode meta data buffers
2215  * will be held by the dnodes in the block preventing them from being freed.
2216  * This means we can't simply traverse the ARC and expect to always find
2217  * enough unheld meta data buffer to release.
2218  *
2219  * Therefore, this function has been updated to make alternating passes
2220  * over the ARC releasing data buffers and then newly unheld meta data
2221  * buffers.  This ensures forward progress is maintained and arc_meta_used
2222  * will decrease.  Normally this is sufficient, but if required the ARC
2223  * will call the registered prune callbacks causing dentry and inodes to
2224  * be dropped from the VFS cache.  This will make dnode meta data buffers
2225  * available for reclaim.
2226  */
2227 static void
2228 arc_adjust_meta(void)
2229 {
2230         int64_t adjustmnt, delta, prune = 0;
2231         arc_buf_contents_t type = ARC_BUFC_DATA;
2232         unsigned long restarts = zfs_arc_meta_adjust_restarts;
2233
2234 restart:
2235         /*
2236          * This slightly differs than the way we evict from the mru in
2237          * arc_adjust because we don't have a "target" value (i.e. no
2238          * "meta" arc_p). As a result, I think we can completely
2239          * cannibalize the metadata in the MRU before we evict the
2240          * metadata from the MFU. I think we probably need to implement a
2241          * "metadata arc_p" value to do this properly.
2242          */
2243         adjustmnt = arc_meta_used - arc_meta_limit;
2244
2245         if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
2246                 delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
2247                 arc_evict(arc_mru, 0, delta, FALSE, type);
2248                 adjustmnt -= delta;
2249         }
2250
2251         /*
2252          * We can't afford to recalculate adjustmnt here. If we do,
2253          * new metadata buffers can sneak into the MRU or ANON lists,
2254          * thus penalize the MFU metadata. Although the fudge factor is
2255          * small, it has been empirically shown to be significant for
2256          * certain workloads (e.g. creating many empty directories). As
2257          * such, we use the original calculation for adjustmnt, and
2258          * simply decrement the amount of data evicted from the MRU.
2259          */
2260
2261         if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
2262                 delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
2263                 arc_evict(arc_mfu, 0, delta, FALSE, type);
2264         }
2265
2266         adjustmnt = arc_meta_used - arc_meta_limit;
2267
2268         if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2269                 delta = MIN(adjustmnt,
2270                     arc_mru_ghost->arcs_lsize[type]);
2271                 arc_evict_ghost(arc_mru_ghost, 0, delta, type);
2272                 adjustmnt -= delta;
2273         }
2274
2275         if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
2276                 delta = MIN(adjustmnt,
2277                     arc_mfu_ghost->arcs_lsize[type]);
2278                 arc_evict_ghost(arc_mfu_ghost, 0, delta, type);
2279         }
2280
2281         /*
2282          * If after attempting to make the requested adjustment to the ARC
2283          * the meta limit is still being exceeded then request that the
2284          * higher layers drop some cached objects which have holds on ARC
2285          * meta buffers.  Requests to the upper layers will be made with
2286          * increasingly large scan sizes until the ARC is below the limit.
2287          */
2288         if (arc_meta_used > arc_meta_limit) {
2289                 if (type == ARC_BUFC_DATA) {
2290                         type = ARC_BUFC_METADATA;
2291                 } else {
2292                         type = ARC_BUFC_DATA;
2293
2294                         if (zfs_arc_meta_prune) {
2295                                 prune += zfs_arc_meta_prune;
2296                                 arc_do_user_prune(prune);
2297                         }
2298                 }
2299
2300                 if (restarts > 0) {
2301                         restarts--;
2302                         goto restart;
2303                 }
2304         }
2305 }
2306
2307 /*
2308  * Flush all *evictable* data from the cache for the given spa.
2309  * NOTE: this will not touch "active" (i.e. referenced) data.
2310  */
2311 void
2312 arc_flush(spa_t *spa)
2313 {
2314         uint64_t guid = 0;
2315
2316         if (spa)
2317                 guid = spa_load_guid(spa);
2318
2319         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2320                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2321                 if (spa)
2322                         break;
2323         }
2324         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2325                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2326                 if (spa)
2327                         break;
2328         }
2329         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2330                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2331                 if (spa)
2332                         break;
2333         }
2334         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2335                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2336                 if (spa)
2337                         break;
2338         }
2339
2340         arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA);
2341         arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA);
2342
2343         mutex_enter(&arc_reclaim_thr_lock);
2344         arc_do_user_evicts();
2345         mutex_exit(&arc_reclaim_thr_lock);
2346         ASSERT(spa || arc_eviction_list == NULL);
2347 }
2348
2349 void
2350 arc_shrink(uint64_t bytes)
2351 {
2352         if (arc_c > arc_c_min) {
2353                 uint64_t to_free;
2354
2355                 to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
2356
2357                 if (arc_c > arc_c_min + to_free)
2358                         atomic_add_64(&arc_c, -to_free);
2359                 else
2360                         arc_c = arc_c_min;
2361
2362                 to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
2363
2364                 if (arc_p > to_free)
2365                         atomic_add_64(&arc_p, -to_free);
2366                 else
2367                         arc_p = 0;
2368
2369                 if (arc_c > arc_size)
2370                         arc_c = MAX(arc_size, arc_c_min);
2371                 if (arc_p > arc_c)
2372                         arc_p = (arc_c >> 1);
2373                 ASSERT(arc_c >= arc_c_min);
2374                 ASSERT((int64_t)arc_p >= 0);
2375         }
2376
2377         if (arc_size > arc_c)
2378                 arc_adjust();
2379 }
2380
2381 static void
2382 arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
2383 {
2384         size_t                  i;
2385         kmem_cache_t            *prev_cache = NULL;
2386         kmem_cache_t            *prev_data_cache = NULL;
2387         extern kmem_cache_t     *zio_buf_cache[];
2388         extern kmem_cache_t     *zio_data_buf_cache[];
2389
2390         /*
2391          * An aggressive reclamation will shrink the cache size as well as
2392          * reap free buffers from the arc kmem caches.
2393          */
2394         if (strat == ARC_RECLAIM_AGGR)
2395                 arc_shrink(bytes);
2396
2397         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2398                 if (zio_buf_cache[i] != prev_cache) {
2399                         prev_cache = zio_buf_cache[i];
2400                         kmem_cache_reap_now(zio_buf_cache[i]);
2401                 }
2402                 if (zio_data_buf_cache[i] != prev_data_cache) {
2403                         prev_data_cache = zio_data_buf_cache[i];
2404                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2405                 }
2406         }
2407
2408         kmem_cache_reap_now(buf_cache);
2409         kmem_cache_reap_now(hdr_cache);
2410 }
2411
2412 /*
2413  * Unlike other ZFS implementations this thread is only responsible for
2414  * adapting the target ARC size on Linux.  The responsibility for memory
2415  * reclamation has been entirely delegated to the arc_shrinker_func()
2416  * which is registered with the VM.  To reflect this change in behavior
2417  * the arc_reclaim thread has been renamed to arc_adapt.
2418  */
2419 static void
2420 arc_adapt_thread(void)
2421 {
2422         callb_cpr_t             cpr;
2423         fstrans_cookie_t        cookie;
2424
2425         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2426
2427         cookie = spl_fstrans_mark();
2428         mutex_enter(&arc_reclaim_thr_lock);
2429         while (arc_thread_exit == 0) {
2430 #ifndef _KERNEL
2431                 arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2432
2433                 if (spa_get_random(100) == 0) {
2434
2435                         if (arc_no_grow) {
2436                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2437                                         last_reclaim = ARC_RECLAIM_AGGR;
2438                                 } else {
2439                                         last_reclaim = ARC_RECLAIM_CONS;
2440                                 }
2441                         } else {
2442                                 arc_no_grow = TRUE;
2443                                 last_reclaim = ARC_RECLAIM_AGGR;
2444                                 membar_producer();
2445                         }
2446
2447                         /* reset the growth delay for every reclaim */
2448                         arc_grow_time = ddi_get_lbolt() +
2449                             (zfs_arc_grow_retry * hz);
2450
2451                         arc_kmem_reap_now(last_reclaim, 0);
2452                         arc_warm = B_TRUE;
2453                 }
2454 #endif /* !_KERNEL */
2455
2456                 /* No recent memory pressure allow the ARC to grow. */
2457                 if (arc_no_grow &&
2458                     ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
2459                         arc_no_grow = FALSE;
2460
2461                 arc_adjust_meta();
2462
2463                 arc_adjust();
2464
2465                 if (arc_eviction_list != NULL)
2466                         arc_do_user_evicts();
2467
2468                 /* block until needed, or one second, whichever is shorter */
2469                 CALLB_CPR_SAFE_BEGIN(&cpr);
2470                 (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
2471                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2472                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2473
2474
2475                 /* Allow the module options to be changed */
2476                 if (zfs_arc_max > 64 << 20 &&
2477                     zfs_arc_max < physmem * PAGESIZE &&
2478                     zfs_arc_max != arc_c_max)
2479                         arc_c_max = zfs_arc_max;
2480
2481                 if (zfs_arc_min > 0 &&
2482                     zfs_arc_min < arc_c_max &&
2483                     zfs_arc_min != arc_c_min)
2484                         arc_c_min = zfs_arc_min;
2485
2486                 if (zfs_arc_meta_limit > 0 &&
2487                     zfs_arc_meta_limit <= arc_c_max &&
2488                     zfs_arc_meta_limit != arc_meta_limit)
2489                         arc_meta_limit = zfs_arc_meta_limit;
2490
2491
2492
2493         }
2494
2495         arc_thread_exit = 0;
2496         cv_broadcast(&arc_reclaim_thr_cv);
2497         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
2498         spl_fstrans_unmark(cookie);
2499         thread_exit();
2500 }
2501
2502 #ifdef _KERNEL
2503 /*
2504  * Determine the amount of memory eligible for eviction contained in the
2505  * ARC. All clean data reported by the ghost lists can always be safely
2506  * evicted. Due to arc_c_min, the same does not hold for all clean data
2507  * contained by the regular mru and mfu lists.
2508  *
2509  * In the case of the regular mru and mfu lists, we need to report as
2510  * much clean data as possible, such that evicting that same reported
2511  * data will not bring arc_size below arc_c_min. Thus, in certain
2512  * circumstances, the total amount of clean data in the mru and mfu
2513  * lists might not actually be evictable.
2514  *
2515  * The following two distinct cases are accounted for:
2516  *
2517  * 1. The sum of the amount of dirty data contained by both the mru and
2518  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
2519  *    is greater than or equal to arc_c_min.
2520  *    (i.e. amount of dirty data >= arc_c_min)
2521  *
2522  *    This is the easy case; all clean data contained by the mru and mfu
2523  *    lists is evictable. Evicting all clean data can only drop arc_size
2524  *    to the amount of dirty data, which is greater than arc_c_min.
2525  *
2526  * 2. The sum of the amount of dirty data contained by both the mru and
2527  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
2528  *    is less than arc_c_min.
2529  *    (i.e. arc_c_min > amount of dirty data)
2530  *
2531  *    2.1. arc_size is greater than or equal arc_c_min.
2532  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
2533  *
2534  *         In this case, not all clean data from the regular mru and mfu
2535  *         lists is actually evictable; we must leave enough clean data
2536  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
2537  *         evictable data from the two lists combined, is exactly the
2538  *         difference between arc_size and arc_c_min.
2539  *
2540  *    2.2. arc_size is less than arc_c_min
2541  *         (i.e. arc_c_min > arc_size > amount of dirty data)
2542  *
2543  *         In this case, none of the data contained in the mru and mfu
2544  *         lists is evictable, even if it's clean. Since arc_size is
2545  *         already below arc_c_min, evicting any more would only
2546  *         increase this negative difference.
2547  */
2548 static uint64_t
2549 arc_evictable_memory(void) {
2550         uint64_t arc_clean =
2551             arc_mru->arcs_lsize[ARC_BUFC_DATA] +
2552             arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
2553             arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
2554             arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
2555         uint64_t ghost_clean =
2556             arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
2557             arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
2558             arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
2559             arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
2560         uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
2561
2562         if (arc_dirty >= arc_c_min)
2563                 return (ghost_clean + arc_clean);
2564
2565         return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
2566 }
2567
2568 /*
2569  * If sc->nr_to_scan is zero, the caller is requesting a query of the
2570  * number of objects which can potentially be freed.  If it is nonzero,
2571  * the request is to free that many objects.
2572  *
2573  * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
2574  * in struct shrinker and also require the shrinker to return the number
2575  * of objects freed.
2576  *
2577  * Older kernels require the shrinker to return the number of freeable
2578  * objects following the freeing of nr_to_free.
2579  */
2580 static spl_shrinker_t
2581 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
2582 {
2583         int64_t pages;
2584
2585         /* The arc is considered warm once reclaim has occurred */
2586         if (unlikely(arc_warm == B_FALSE))
2587                 arc_warm = B_TRUE;
2588
2589         /* Return the potential number of reclaimable pages */
2590         pages = btop((int64_t)arc_evictable_memory());
2591         if (sc->nr_to_scan == 0)
2592                 return (pages);
2593
2594         /* Not allowed to perform filesystem reclaim */
2595         if (!(sc->gfp_mask & __GFP_FS))
2596                 return (SHRINK_STOP);
2597
2598         /* Reclaim in progress */
2599         if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
2600                 return (SHRINK_STOP);
2601
2602         /*
2603          * Evict the requested number of pages by shrinking arc_c the
2604          * requested amount.  If there is nothing left to evict just
2605          * reap whatever we can from the various arc slabs.
2606          */
2607         if (pages > 0) {
2608                 arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
2609
2610 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
2611                 pages = MAX(pages - btop(arc_evictable_memory()), 0);
2612 #else
2613                 pages = btop(arc_evictable_memory());
2614 #endif
2615         } else {
2616                 arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
2617                 pages = SHRINK_STOP;
2618         }
2619
2620         /*
2621          * When direct reclaim is observed it usually indicates a rapid
2622          * increase in memory pressure.  This occurs because the kswapd
2623          * threads were unable to asynchronously keep enough free memory
2624          * available.  In this case set arc_no_grow to briefly pause arc
2625          * growth to avoid compounding the memory pressure.
2626          */
2627         if (current_is_kswapd()) {
2628                 ARCSTAT_BUMP(arcstat_memory_indirect_count);
2629         } else {
2630                 arc_no_grow = B_TRUE;
2631                 arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
2632                 ARCSTAT_BUMP(arcstat_memory_direct_count);
2633         }
2634
2635         mutex_exit(&arc_reclaim_thr_lock);
2636
2637         return (pages);
2638 }
2639 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
2640
2641 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
2642 #endif /* _KERNEL */
2643
2644 /*
2645  * Adapt arc info given the number of bytes we are trying to add and
2646  * the state that we are comming from.  This function is only called
2647  * when we are adding new content to the cache.
2648  */
2649 static void
2650 arc_adapt(int bytes, arc_state_t *state)
2651 {
2652         int mult;
2653
2654         if (state == arc_l2c_only)
2655                 return;
2656
2657         ASSERT(bytes > 0);
2658         /*
2659          * Adapt the target size of the MRU list:
2660          *      - if we just hit in the MRU ghost list, then increase
2661          *        the target size of the MRU list.
2662          *      - if we just hit in the MFU ghost list, then increase
2663          *        the target size of the MFU list by decreasing the
2664          *        target size of the MRU list.
2665          */
2666         if (state == arc_mru_ghost) {
2667                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2668                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2669
2670                 if (!zfs_arc_p_dampener_disable)
2671                         mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2672
2673                 arc_p = MIN(arc_c, arc_p + bytes * mult);
2674         } else if (state == arc_mfu_ghost) {
2675                 uint64_t delta;
2676
2677                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2678                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2679
2680                 if (!zfs_arc_p_dampener_disable)
2681                         mult = MIN(mult, 10);
2682
2683                 delta = MIN(bytes * mult, arc_p);
2684                 arc_p = MAX(0, arc_p - delta);
2685         }
2686         ASSERT((int64_t)arc_p >= 0);
2687
2688         if (arc_no_grow)
2689                 return;
2690
2691         if (arc_c >= arc_c_max)
2692                 return;
2693
2694         /*
2695          * If we're within (2 * maxblocksize) bytes of the target
2696          * cache size, increment the target cache size
2697          */
2698         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2699                 atomic_add_64(&arc_c, (int64_t)bytes);
2700                 if (arc_c > arc_c_max)
2701                         arc_c = arc_c_max;
2702                 else if (state == arc_anon)
2703                         atomic_add_64(&arc_p, (int64_t)bytes);
2704                 if (arc_p > arc_c)
2705                         arc_p = arc_c;
2706         }
2707         ASSERT((int64_t)arc_p >= 0);
2708 }
2709
2710 /*
2711  * Check if the cache has reached its limits and eviction is required
2712  * prior to insert.
2713  */
2714 static int
2715 arc_evict_needed(arc_buf_contents_t type)
2716 {
2717         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2718                 return (1);
2719
2720         if (arc_no_grow)
2721                 return (1);
2722
2723         return (arc_size > arc_c);
2724 }
2725
2726 /*
2727  * The buffer, supplied as the first argument, needs a data block.
2728  * So, if we are at cache max, determine which cache should be victimized.
2729  * We have the following cases:
2730  *
2731  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2732  * In this situation if we're out of space, but the resident size of the MFU is
2733  * under the limit, victimize the MFU cache to satisfy this insertion request.
2734  *
2735  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2736  * Here, we've used up all of the available space for the MRU, so we need to
2737  * evict from our own cache instead.  Evict from the set of resident MRU
2738  * entries.
2739  *
2740  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2741  * c minus p represents the MFU space in the cache, since p is the size of the
2742  * cache that is dedicated to the MRU.  In this situation there's still space on
2743  * the MFU side, so the MRU side needs to be victimized.
2744  *
2745  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2746  * MFU's resident set is consuming more space than it has been allotted.  In
2747  * this situation, we must victimize our own cache, the MFU, for this insertion.
2748  */
2749 static void
2750 arc_get_data_buf(arc_buf_t *buf)
2751 {
2752         arc_state_t             *state = buf->b_hdr->b_state;
2753         uint64_t                size = buf->b_hdr->b_size;
2754         arc_buf_contents_t      type = buf->b_hdr->b_type;
2755         arc_buf_contents_t      evict = ARC_BUFC_DATA;
2756         boolean_t               recycle = TRUE;
2757
2758         arc_adapt(size, state);
2759
2760         /*
2761          * We have not yet reached cache maximum size,
2762          * just allocate a new buffer.
2763          */
2764         if (!arc_evict_needed(type)) {
2765                 if (type == ARC_BUFC_METADATA) {
2766                         buf->b_data = zio_buf_alloc(size);
2767                         arc_space_consume(size, ARC_SPACE_META);
2768                 } else {
2769                         ASSERT(type == ARC_BUFC_DATA);
2770                         buf->b_data = zio_data_buf_alloc(size);
2771                         arc_space_consume(size, ARC_SPACE_DATA);
2772                 }
2773                 goto out;
2774         }
2775
2776         /*
2777          * If we are prefetching from the mfu ghost list, this buffer
2778          * will end up on the mru list; so steal space from there.
2779          */
2780         if (state == arc_mfu_ghost)
2781                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2782         else if (state == arc_mru_ghost)
2783                 state = arc_mru;
2784
2785         if (state == arc_mru || state == arc_anon) {
2786                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2787                 state = (arc_mfu->arcs_lsize[type] >= size &&
2788                     arc_p > mru_used) ? arc_mfu : arc_mru;
2789         } else {
2790                 /* MFU cases */
2791                 uint64_t mfu_space = arc_c - arc_p;
2792                 state =  (arc_mru->arcs_lsize[type] >= size &&
2793                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2794         }
2795
2796         /*
2797          * Evict data buffers prior to metadata buffers, unless we're
2798          * over the metadata limit and adding a metadata buffer.
2799          */
2800         if (type == ARC_BUFC_METADATA) {
2801                 if (arc_meta_used >= arc_meta_limit)
2802                         evict = ARC_BUFC_METADATA;
2803                 else
2804                         /*
2805                          * In this case, we're evicting data while
2806                          * adding metadata. Thus, to prevent recycling a
2807                          * data buffer into a metadata buffer, recycling
2808                          * is disabled in the following arc_evict call.
2809                          */
2810                         recycle = FALSE;
2811         }
2812
2813         if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) {
2814                 if (type == ARC_BUFC_METADATA) {
2815                         buf->b_data = zio_buf_alloc(size);
2816                         arc_space_consume(size, ARC_SPACE_META);
2817
2818                         /*
2819                          * If we are unable to recycle an existing meta buffer
2820                          * signal the reclaim thread.  It will notify users
2821                          * via the prune callback to drop references.  The
2822                          * prune callback in run in the context of the reclaim
2823                          * thread to avoid deadlocking on the hash_lock.
2824                          * Of course, only do this when recycle is true.
2825                          */
2826                         if (recycle)
2827                                 cv_signal(&arc_reclaim_thr_cv);
2828                 } else {
2829                         ASSERT(type == ARC_BUFC_DATA);
2830                         buf->b_data = zio_data_buf_alloc(size);
2831                         arc_space_consume(size, ARC_SPACE_DATA);
2832                 }
2833
2834                 /* Only bump this if we tried to recycle and failed */
2835                 if (recycle)
2836                         ARCSTAT_BUMP(arcstat_recycle_miss);
2837         }
2838         ASSERT(buf->b_data != NULL);
2839 out:
2840         /*
2841          * Update the state size.  Note that ghost states have a
2842          * "ghost size" and so don't need to be updated.
2843          */
2844         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2845                 arc_buf_hdr_t *hdr = buf->b_hdr;
2846
2847                 atomic_add_64(&hdr->b_state->arcs_size, size);
2848                 if (list_link_active(&hdr->b_arc_node)) {
2849                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2850                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2851                 }
2852                 /*
2853                  * If we are growing the cache, and we are adding anonymous
2854                  * data, and we have outgrown arc_p, update arc_p
2855                  */
2856                 if (!zfs_arc_p_aggressive_disable &&
2857                     arc_size < arc_c && hdr->b_state == arc_anon &&
2858                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2859                         arc_p = MIN(arc_c, arc_p + size);
2860         }
2861 }
2862
2863 /*
2864  * This routine is called whenever a buffer is accessed.
2865  * NOTE: the hash lock is dropped in this function.
2866  */
2867 static void
2868 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2869 {
2870         clock_t now;
2871
2872         ASSERT(MUTEX_HELD(hash_lock));
2873
2874         if (buf->b_state == arc_anon) {
2875                 /*
2876                  * This buffer is not in the cache, and does not
2877                  * appear in our "ghost" list.  Add the new buffer
2878                  * to the MRU state.
2879                  */
2880
2881                 ASSERT(buf->b_arc_access == 0);
2882                 buf->b_arc_access = ddi_get_lbolt();
2883                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2884                 arc_change_state(arc_mru, buf, hash_lock);
2885
2886         } else if (buf->b_state == arc_mru) {
2887                 now = ddi_get_lbolt();
2888
2889                 /*
2890                  * If this buffer is here because of a prefetch, then either:
2891                  * - clear the flag if this is a "referencing" read
2892                  *   (any subsequent access will bump this into the MFU state).
2893                  * or
2894                  * - move the buffer to the head of the list if this is
2895                  *   another prefetch (to make it less likely to be evicted).
2896                  */
2897                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2898                         if (refcount_count(&buf->b_refcnt) == 0) {
2899                                 ASSERT(list_link_active(&buf->b_arc_node));
2900                         } else {
2901                                 buf->b_flags &= ~ARC_PREFETCH;
2902                                 atomic_inc_32(&buf->b_mru_hits);
2903                                 ARCSTAT_BUMP(arcstat_mru_hits);
2904                         }
2905                         buf->b_arc_access = now;
2906                         return;
2907                 }
2908
2909                 /*
2910                  * This buffer has been "accessed" only once so far,
2911                  * but it is still in the cache. Move it to the MFU
2912                  * state.
2913                  */
2914                 if (ddi_time_after(now, buf->b_arc_access + ARC_MINTIME)) {
2915                         /*
2916                          * More than 125ms have passed since we
2917                          * instantiated this buffer.  Move it to the
2918                          * most frequently used state.
2919                          */
2920                         buf->b_arc_access = now;
2921                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2922                         arc_change_state(arc_mfu, buf, hash_lock);
2923                 }
2924                 atomic_inc_32(&buf->b_mru_hits);
2925                 ARCSTAT_BUMP(arcstat_mru_hits);
2926         } else if (buf->b_state == arc_mru_ghost) {
2927                 arc_state_t     *new_state;
2928                 /*
2929                  * This buffer has been "accessed" recently, but
2930                  * was evicted from the cache.  Move it to the
2931                  * MFU state.
2932                  */
2933
2934                 if (buf->b_flags & ARC_PREFETCH) {
2935                         new_state = arc_mru;
2936                         if (refcount_count(&buf->b_refcnt) > 0)
2937                                 buf->b_flags &= ~ARC_PREFETCH;
2938                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2939                 } else {
2940                         new_state = arc_mfu;
2941                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2942                 }
2943
2944                 buf->b_arc_access = ddi_get_lbolt();
2945                 arc_change_state(new_state, buf, hash_lock);
2946
2947                 atomic_inc_32(&buf->b_mru_ghost_hits);
2948                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2949         } else if (buf->b_state == arc_mfu) {
2950                 /*
2951                  * This buffer has been accessed more than once and is
2952                  * still in the cache.  Keep it in the MFU state.
2953                  *
2954                  * NOTE: an add_reference() that occurred when we did
2955                  * the arc_read() will have kicked this off the list.
2956                  * If it was a prefetch, we will explicitly move it to
2957                  * the head of the list now.
2958                  */
2959                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2960                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2961                         ASSERT(list_link_active(&buf->b_arc_node));
2962                 }
2963                 atomic_inc_32(&buf->b_mfu_hits);
2964                 ARCSTAT_BUMP(arcstat_mfu_hits);
2965                 buf->b_arc_access = ddi_get_lbolt();
2966         } else if (buf->b_state == arc_mfu_ghost) {
2967                 arc_state_t     *new_state = arc_mfu;
2968                 /*
2969                  * This buffer has been accessed more than once but has
2970                  * been evicted from the cache.  Move it back to the
2971                  * MFU state.
2972                  */
2973
2974                 if (buf->b_flags & ARC_PREFETCH) {
2975                         /*
2976                          * This is a prefetch access...
2977                          * move this block back to the MRU state.
2978                          */
2979                         ASSERT0(refcount_count(&buf->b_refcnt));
2980                         new_state = arc_mru;
2981                 }
2982
2983                 buf->b_arc_access = ddi_get_lbolt();
2984                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2985                 arc_change_state(new_state, buf, hash_lock);
2986
2987                 atomic_inc_32(&buf->b_mfu_ghost_hits);
2988                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2989         } else if (buf->b_state == arc_l2c_only) {
2990                 /*
2991                  * This buffer is on the 2nd Level ARC.
2992                  */
2993
2994                 buf->b_arc_access = ddi_get_lbolt();
2995                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2996                 arc_change_state(arc_mfu, buf, hash_lock);
2997         } else {
2998                 cmn_err(CE_PANIC, "invalid arc state 0x%p", buf->b_state);
2999         }
3000 }
3001
3002 /* a generic arc_done_func_t which you can use */
3003 /* ARGSUSED */
3004 void
3005 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3006 {
3007         if (zio == NULL || zio->io_error == 0)
3008                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3009         VERIFY(arc_buf_remove_ref(buf, arg));
3010 }
3011
3012 /* a generic arc_done_func_t */
3013 void
3014 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3015 {
3016         arc_buf_t **bufp = arg;
3017         if (zio && zio->io_error) {
3018                 VERIFY(arc_buf_remove_ref(buf, arg));
3019                 *bufp = NULL;
3020         } else {
3021                 *bufp = buf;
3022                 ASSERT(buf->b_data);
3023         }
3024 }
3025
3026 static void
3027 arc_read_done(zio_t *zio)
3028 {
3029         arc_buf_hdr_t   *hdr;
3030         arc_buf_t       *buf;
3031         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
3032         kmutex_t        *hash_lock = NULL;
3033         arc_callback_t  *callback_list, *acb;
3034         int             freeable = FALSE;
3035
3036         buf = zio->io_private;
3037         hdr = buf->b_hdr;
3038
3039         /*
3040          * The hdr was inserted into hash-table and removed from lists
3041          * prior to starting I/O.  We should find this header, since
3042          * it's in the hash table, and it should be legit since it's
3043          * not possible to evict it during the I/O.  The only possible
3044          * reason for it not to be found is if we were freed during the
3045          * read.
3046          */
3047         if (HDR_IN_HASH_TABLE(hdr)) {
3048                 arc_buf_hdr_t *found;
3049
3050                 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3051                 ASSERT3U(hdr->b_dva.dva_word[0], ==,
3052                     BP_IDENTITY(zio->io_bp)->dva_word[0]);
3053                 ASSERT3U(hdr->b_dva.dva_word[1], ==,
3054                     BP_IDENTITY(zio->io_bp)->dva_word[1]);
3055
3056                 found = buf_hash_find(hdr->b_spa, zio->io_bp,
3057                     &hash_lock);
3058
3059                 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3060                     hash_lock == NULL) ||
3061                     (found == hdr &&
3062                     DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3063                     (found == hdr && HDR_L2_READING(hdr)));
3064         }
3065
3066         hdr->b_flags &= ~ARC_L2_EVICTED;
3067         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3068                 hdr->b_flags &= ~ARC_L2CACHE;
3069
3070         /* byteswap if necessary */
3071         callback_list = hdr->b_acb;
3072         ASSERT(callback_list != NULL);
3073         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3074                 dmu_object_byteswap_t bswap =
3075                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3076                 if (BP_GET_LEVEL(zio->io_bp) > 0)
3077                     byteswap_uint64_array(buf->b_data, hdr->b_size);
3078                 else
3079                     dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
3080         }
3081
3082         arc_cksum_compute(buf, B_FALSE);
3083         arc_buf_watch(buf);
3084
3085         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3086                 /*
3087                  * Only call arc_access on anonymous buffers.  This is because
3088                  * if we've issued an I/O for an evicted buffer, we've already
3089                  * called arc_access (to prevent any simultaneous readers from
3090                  * getting confused).
3091                  */
3092                 arc_access(hdr, hash_lock);
3093         }
3094
3095         /* create copies of the data buffer for the callers */
3096         abuf = buf;
3097         for (acb = callback_list; acb; acb = acb->acb_next) {
3098                 if (acb->acb_done) {
3099                         if (abuf == NULL) {
3100                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
3101                                 abuf = arc_buf_clone(buf);
3102                         }
3103                         acb->acb_buf = abuf;
3104                         abuf = NULL;
3105                 }
3106         }
3107         hdr->b_acb = NULL;
3108         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3109         ASSERT(!HDR_BUF_AVAILABLE(hdr));
3110         if (abuf == buf) {
3111                 ASSERT(buf->b_efunc == NULL);
3112                 ASSERT(hdr->b_datacnt == 1);
3113                 hdr->b_flags |= ARC_BUF_AVAILABLE;
3114         }
3115
3116         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3117
3118         if (zio->io_error != 0) {
3119                 hdr->b_flags |= ARC_IO_ERROR;
3120                 if (hdr->b_state != arc_anon)
3121                         arc_change_state(arc_anon, hdr, hash_lock);
3122                 if (HDR_IN_HASH_TABLE(hdr))
3123                         buf_hash_remove(hdr);
3124                 freeable = refcount_is_zero(&hdr->b_refcnt);
3125         }
3126
3127         /*
3128          * Broadcast before we drop the hash_lock to avoid the possibility
3129          * that the hdr (and hence the cv) might be freed before we get to
3130          * the cv_broadcast().
3131          */
3132         cv_broadcast(&hdr->b_cv);
3133
3134         if (hash_lock) {
3135                 mutex_exit(hash_lock);
3136         } else {
3137                 /*
3138                  * This block was freed while we waited for the read to
3139                  * complete.  It has been removed from the hash table and
3140                  * moved to the anonymous state (so that it won't show up
3141                  * in the cache).
3142                  */
3143                 ASSERT3P(hdr->b_state, ==, arc_anon);
3144                 freeable = refcount_is_zero(&hdr->b_refcnt);
3145         }
3146
3147         /* execute each callback and free its structure */
3148         while ((acb = callback_list) != NULL) {
3149                 if (acb->acb_done)
3150                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3151
3152                 if (acb->acb_zio_dummy != NULL) {
3153                         acb->acb_zio_dummy->io_error = zio->io_error;
3154                         zio_nowait(acb->acb_zio_dummy);
3155                 }
3156
3157                 callback_list = acb->acb_next;
3158                 kmem_free(acb, sizeof (arc_callback_t));
3159         }
3160
3161         if (freeable)
3162                 arc_hdr_destroy(hdr);
3163 }
3164
3165 /*
3166  * "Read" the block at the specified DVA (in bp) via the
3167  * cache.  If the block is found in the cache, invoke the provided
3168  * callback immediately and return.  Note that the `zio' parameter
3169  * in the callback will be NULL in this case, since no IO was
3170  * required.  If the block is not in the cache pass the read request
3171  * on to the spa with a substitute callback function, so that the
3172  * requested block will be added to the cache.
3173  *
3174  * If a read request arrives for a block that has a read in-progress,
3175  * either wait for the in-progress read to complete (and return the
3176  * results); or, if this is a read with a "done" func, add a record
3177  * to the read to invoke the "done" func when the read completes,
3178  * and return; or just return.
3179  *
3180  * arc_read_done() will invoke all the requested "done" functions
3181  * for readers of this block.
3182  */
3183 int
3184 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3185     void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3186     const zbookmark_phys_t *zb)
3187 {
3188         arc_buf_hdr_t *hdr = NULL;
3189         arc_buf_t *buf = NULL;
3190         kmutex_t *hash_lock = NULL;
3191         zio_t *rzio;
3192         uint64_t guid = spa_load_guid(spa);
3193         int rc = 0;
3194
3195         ASSERT(!BP_IS_EMBEDDED(bp) ||
3196             BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3197
3198 top:
3199         if (!BP_IS_EMBEDDED(bp)) {
3200                 /*
3201                  * Embedded BP's have no DVA and require no I/O to "read".
3202                  * Create an anonymous arc buf to back it.
3203                  */
3204                 hdr = buf_hash_find(guid, bp, &hash_lock);
3205         }
3206
3207         if (hdr != NULL && hdr->b_datacnt > 0) {
3208
3209                 *arc_flags |= ARC_CACHED;
3210
3211                 if (HDR_IO_IN_PROGRESS(hdr)) {
3212
3213                         if (*arc_flags & ARC_WAIT) {
3214                                 cv_wait(&hdr->b_cv, hash_lock);
3215                                 mutex_exit(hash_lock);
3216                                 goto top;
3217                         }
3218                         ASSERT(*arc_flags & ARC_NOWAIT);
3219
3220                         if (done) {
3221                                 arc_callback_t  *acb = NULL;
3222
3223                                 acb = kmem_zalloc(sizeof (arc_callback_t),
3224                                     KM_SLEEP);
3225                                 acb->acb_done = done;
3226                                 acb->acb_private = private;
3227                                 if (pio != NULL)
3228                                         acb->acb_zio_dummy = zio_null(pio,
3229                                             spa, NULL, NULL, NULL, zio_flags);
3230
3231                                 ASSERT(acb->acb_done != NULL);
3232                                 acb->acb_next = hdr->b_acb;
3233                                 hdr->b_acb = acb;
3234                                 add_reference(hdr, hash_lock, private);
3235                                 mutex_exit(hash_lock);
3236                                 goto out;
3237                         }
3238                         mutex_exit(hash_lock);
3239                         goto out;
3240                 }
3241
3242                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3243
3244                 if (done) {
3245                         add_reference(hdr, hash_lock, private);
3246                         /*
3247                          * If this block is already in use, create a new
3248                          * copy of the data so that we will be guaranteed
3249                          * that arc_release() will always succeed.
3250                          */
3251                         buf = hdr->b_buf;
3252                         ASSERT(buf);
3253                         ASSERT(buf->b_data);
3254                         if (HDR_BUF_AVAILABLE(hdr)) {
3255                                 ASSERT(buf->b_efunc == NULL);
3256                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3257                         } else {
3258                                 buf = arc_buf_clone(buf);
3259                         }
3260
3261                 } else if (*arc_flags & ARC_PREFETCH &&
3262                     refcount_count(&hdr->b_refcnt) == 0) {
3263                         hdr->b_flags |= ARC_PREFETCH;
3264                 }
3265                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3266                 arc_access(hdr, hash_lock);
3267                 if (*arc_flags & ARC_L2CACHE)
3268                         hdr->b_flags |= ARC_L2CACHE;
3269                 if (*arc_flags & ARC_L2COMPRESS)
3270                         hdr->b_flags |= ARC_L2COMPRESS;
3271                 mutex_exit(hash_lock);
3272                 ARCSTAT_BUMP(arcstat_hits);
3273                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3274                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3275                     data, metadata, hits);
3276
3277                 if (done)
3278                         done(NULL, buf, private);
3279         } else {
3280                 uint64_t size = BP_GET_LSIZE(bp);
3281                 arc_callback_t *acb;
3282                 vdev_t *vd = NULL;
3283                 uint64_t addr = 0;
3284                 boolean_t devw = B_FALSE;
3285                 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3286                 uint64_t b_asize = 0;
3287
3288                 /*
3289                  * Gracefully handle a damaged logical block size as a
3290                  * checksum error by passing a dummy zio to the done callback.
3291                  */
3292                 if (size > SPA_MAXBLOCKSIZE) {
3293                         if (done) {
3294                                 rzio = zio_null(pio, spa, NULL,
3295                                     NULL, NULL, zio_flags);
3296                                 rzio->io_error = ECKSUM;
3297                                 done(rzio, buf, private);
3298                                 zio_nowait(rzio);
3299                         }
3300                         rc = ECKSUM;
3301                         goto out;
3302                 }
3303
3304                 if (hdr == NULL) {
3305                         /* this block is not in the cache */
3306                         arc_buf_hdr_t *exists = NULL;
3307                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3308                         buf = arc_buf_alloc(spa, size, private, type);
3309                         hdr = buf->b_hdr;
3310                         if (!BP_IS_EMBEDDED(bp)) {
3311                                 hdr->b_dva = *BP_IDENTITY(bp);
3312                                 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3313                                 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3314                                 exists = buf_hash_insert(hdr, &hash_lock);
3315                         }
3316                         if (exists != NULL) {
3317                                 /* somebody beat us to the hash insert */
3318                                 mutex_exit(hash_lock);
3319                                 buf_discard_identity(hdr);
3320                                 (void) arc_buf_remove_ref(buf, private);
3321                                 goto top; /* restart the IO request */
3322                         }
3323                         /* if this is a prefetch, we don't have a reference */
3324                         if (*arc_flags & ARC_PREFETCH) {
3325                                 (void) remove_reference(hdr, hash_lock,
3326                                     private);
3327                                 hdr->b_flags |= ARC_PREFETCH;
3328                         }
3329                         if (*arc_flags & ARC_L2CACHE)
3330                                 hdr->b_flags |= ARC_L2CACHE;
3331                         if (*arc_flags & ARC_L2COMPRESS)
3332                                 hdr->b_flags |= ARC_L2COMPRESS;
3333                         if (BP_GET_LEVEL(bp) > 0)
3334                                 hdr->b_flags |= ARC_INDIRECT;
3335                 } else {
3336                         /* this block is in the ghost cache */
3337                         ASSERT(GHOST_STATE(hdr->b_state));
3338                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3339                         ASSERT0(refcount_count(&hdr->b_refcnt));
3340                         ASSERT(hdr->b_buf == NULL);
3341
3342                         /* if this is a prefetch, we don't have a reference */
3343                         if (*arc_flags & ARC_PREFETCH)
3344                                 hdr->b_flags |= ARC_PREFETCH;
3345                         else
3346                                 add_reference(hdr, hash_lock, private);
3347                         if (*arc_flags & ARC_L2CACHE)
3348                                 hdr->b_flags |= ARC_L2CACHE;
3349                         if (*arc_flags & ARC_L2COMPRESS)
3350                                 hdr->b_flags |= ARC_L2COMPRESS;
3351                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3352                         buf->b_hdr = hdr;
3353                         buf->b_data = NULL;
3354                         buf->b_efunc = NULL;
3355                         buf->b_private = NULL;
3356                         buf->b_next = NULL;
3357                         hdr->b_buf = buf;
3358                         ASSERT(hdr->b_datacnt == 0);
3359                         hdr->b_datacnt = 1;
3360                         arc_get_data_buf(buf);
3361                         arc_access(hdr, hash_lock);
3362                 }
3363
3364                 ASSERT(!GHOST_STATE(hdr->b_state));
3365
3366                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3367                 acb->acb_done = done;
3368                 acb->acb_private = private;
3369
3370                 ASSERT(hdr->b_acb == NULL);
3371                 hdr->b_acb = acb;
3372                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3373
3374                 if (hdr->b_l2hdr != NULL &&
3375                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3376                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3377                         addr = hdr->b_l2hdr->b_daddr;
3378                         b_compress = hdr->b_l2hdr->b_compress;
3379                         b_asize = hdr->b_l2hdr->b_asize;
3380                         /*
3381                          * Lock out device removal.
3382                          */
3383                         if (vdev_is_dead(vd) ||
3384                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3385                                 vd = NULL;
3386                 }
3387
3388                 if (hash_lock != NULL)
3389                         mutex_exit(hash_lock);
3390
3391                 /*
3392                  * At this point, we have a level 1 cache miss.  Try again in
3393                  * L2ARC if possible.
3394                  */
3395                 ASSERT3U(hdr->b_size, ==, size);
3396                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3397                     uint64_t, size, zbookmark_phys_t *, zb);
3398                 ARCSTAT_BUMP(arcstat_misses);
3399                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3400                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3401                     data, metadata, misses);
3402
3403                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3404                         /*
3405                          * Read from the L2ARC if the following are true:
3406                          * 1. The L2ARC vdev was previously cached.
3407                          * 2. This buffer still has L2ARC metadata.
3408                          * 3. This buffer isn't currently writing to the L2ARC.
3409                          * 4. The L2ARC entry wasn't evicted, which may
3410                          *    also have invalidated the vdev.
3411                          * 5. This isn't prefetch and l2arc_noprefetch is set.
3412                          */
3413                         if (hdr->b_l2hdr != NULL &&
3414                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3415                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3416                                 l2arc_read_callback_t *cb;
3417
3418                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3419                                 ARCSTAT_BUMP(arcstat_l2_hits);
3420                                 atomic_inc_32(&hdr->b_l2hdr->b_hits);
3421
3422                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3423                                     KM_SLEEP);
3424                                 cb->l2rcb_buf = buf;
3425                                 cb->l2rcb_spa = spa;
3426                                 cb->l2rcb_bp = *bp;
3427                                 cb->l2rcb_zb = *zb;
3428                                 cb->l2rcb_flags = zio_flags;
3429                                 cb->l2rcb_compress = b_compress;
3430
3431                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3432                                     addr + size < vd->vdev_psize -
3433                                     VDEV_LABEL_END_SIZE);
3434
3435                                 /*
3436                                  * l2arc read.  The SCL_L2ARC lock will be
3437                                  * released by l2arc_read_done().
3438                                  * Issue a null zio if the underlying buffer
3439                                  * was squashed to zero size by compression.
3440                                  */
3441                                 if (b_compress == ZIO_COMPRESS_EMPTY) {
3442                                         rzio = zio_null(pio, spa, vd,
3443                                             l2arc_read_done, cb,
3444                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3445                                             ZIO_FLAG_CANFAIL |
3446                                             ZIO_FLAG_DONT_PROPAGATE |
3447                                             ZIO_FLAG_DONT_RETRY);
3448                                 } else {
3449                                         rzio = zio_read_phys(pio, vd, addr,
3450                                             b_asize, buf->b_data,
3451                                             ZIO_CHECKSUM_OFF,
3452                                             l2arc_read_done, cb, priority,
3453                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3454                                             ZIO_FLAG_CANFAIL |
3455                                             ZIO_FLAG_DONT_PROPAGATE |
3456                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
3457                                 }
3458                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3459                                     zio_t *, rzio);
3460                                 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3461
3462                                 if (*arc_flags & ARC_NOWAIT) {
3463                                         zio_nowait(rzio);
3464                                         goto out;
3465                                 }
3466
3467                                 ASSERT(*arc_flags & ARC_WAIT);
3468                                 if (zio_wait(rzio) == 0)
3469                                         goto out;
3470
3471                                 /* l2arc read error; goto zio_read() */
3472                         } else {
3473                                 DTRACE_PROBE1(l2arc__miss,
3474                                     arc_buf_hdr_t *, hdr);
3475                                 ARCSTAT_BUMP(arcstat_l2_misses);
3476                                 if (HDR_L2_WRITING(hdr))
3477                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3478                                 spa_config_exit(spa, SCL_L2ARC, vd);
3479                         }
3480                 } else {
3481                         if (vd != NULL)
3482                                 spa_config_exit(spa, SCL_L2ARC, vd);
3483                         if (l2arc_ndev != 0) {
3484                                 DTRACE_PROBE1(l2arc__miss,
3485                                     arc_buf_hdr_t *, hdr);
3486                                 ARCSTAT_BUMP(arcstat_l2_misses);
3487                         }
3488                 }
3489
3490                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3491                     arc_read_done, buf, priority, zio_flags, zb);
3492
3493                 if (*arc_flags & ARC_WAIT) {
3494                         rc = zio_wait(rzio);
3495                         goto out;
3496                 }
3497
3498                 ASSERT(*arc_flags & ARC_NOWAIT);
3499                 zio_nowait(rzio);
3500         }
3501
3502 out:
3503         spa_read_history_add(spa, zb, *arc_flags);
3504         return (rc);
3505 }
3506
3507 arc_prune_t *
3508 arc_add_prune_callback(arc_prune_func_t *func, void *private)
3509 {
3510         arc_prune_t *p;
3511
3512         p = kmem_alloc(sizeof (*p), KM_SLEEP);
3513         p->p_pfunc = func;
3514         p->p_private = private;
3515         list_link_init(&p->p_node);
3516         refcount_create(&p->p_refcnt);
3517
3518         mutex_enter(&arc_prune_mtx);
3519         refcount_add(&p->p_refcnt, &arc_prune_list);
3520         list_insert_head(&arc_prune_list, p);
3521         mutex_exit(&arc_prune_mtx);
3522
3523         return (p);
3524 }
3525
3526 void
3527 arc_remove_prune_callback(arc_prune_t *p)
3528 {
3529         mutex_enter(&arc_prune_mtx);
3530         list_remove(&arc_prune_list, p);
3531         if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
3532                 refcount_destroy(&p->p_refcnt);
3533                 kmem_free(p, sizeof (*p));
3534         }
3535         mutex_exit(&arc_prune_mtx);
3536 }
3537
3538 void
3539 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3540 {
3541         ASSERT(buf->b_hdr != NULL);
3542         ASSERT(buf->b_hdr->b_state != arc_anon);
3543         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3544         ASSERT(buf->b_efunc == NULL);
3545         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3546
3547         buf->b_efunc = func;
3548         buf->b_private = private;
3549 }
3550
3551 /*
3552  * Notify the arc that a block was freed, and thus will never be used again.
3553  */
3554 void
3555 arc_freed(spa_t *spa, const blkptr_t *bp)
3556 {
3557         arc_buf_hdr_t *hdr;
3558         kmutex_t *hash_lock;
3559         uint64_t guid = spa_load_guid(spa);
3560
3561         ASSERT(!BP_IS_EMBEDDED(bp));
3562
3563         hdr = buf_hash_find(guid, bp, &hash_lock);
3564         if (hdr == NULL)
3565                 return;
3566         if (HDR_BUF_AVAILABLE(hdr)) {
3567                 arc_buf_t *buf = hdr->b_buf;
3568                 add_reference(hdr, hash_lock, FTAG);
3569                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3570                 mutex_exit(hash_lock);
3571
3572                 arc_release(buf, FTAG);
3573                 (void) arc_buf_remove_ref(buf, FTAG);
3574         } else {
3575                 mutex_exit(hash_lock);
3576         }
3577
3578 }
3579
3580 /*
3581  * Clear the user eviction callback set by arc_set_callback(), first calling
3582  * it if it exists.  Because the presence of a callback keeps an arc_buf cached
3583  * clearing the callback may result in the arc_buf being destroyed.  However,
3584  * it will not result in the *last* arc_buf being destroyed, hence the data
3585  * will remain cached in the ARC. We make a copy of the arc buffer here so
3586  * that we can process the callback without holding any locks.
3587  *
3588  * It's possible that the callback is already in the process of being cleared
3589  * by another thread.  In this case we can not clear the callback.
3590  *
3591  * Returns B_TRUE if the callback was successfully called and cleared.
3592  */
3593 boolean_t
3594 arc_clear_callback(arc_buf_t *buf)
3595 {
3596         arc_buf_hdr_t *hdr;
3597         kmutex_t *hash_lock;
3598         arc_evict_func_t *efunc = buf->b_efunc;
3599         void *private = buf->b_private;
3600
3601         mutex_enter(&buf->b_evict_lock);
3602         hdr = buf->b_hdr;
3603         if (hdr == NULL) {
3604                 /*
3605                  * We are in arc_do_user_evicts().
3606                  */
3607                 ASSERT(buf->b_data == NULL);
3608                 mutex_exit(&buf->b_evict_lock);
3609                 return (B_FALSE);
3610         } else if (buf->b_data == NULL) {
3611                 /*
3612                  * We are on the eviction list; process this buffer now
3613                  * but let arc_do_user_evicts() do the reaping.
3614                  */
3615                 buf->b_efunc = NULL;
3616                 mutex_exit(&buf->b_evict_lock);
3617                 VERIFY0(efunc(private));
3618                 return (B_TRUE);
3619         }
3620         hash_lock = HDR_LOCK(hdr);
3621         mutex_enter(hash_lock);
3622         hdr = buf->b_hdr;
3623         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3624
3625         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3626         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3627
3628         buf->b_efunc = NULL;
3629         buf->b_private = NULL;
3630
3631         if (hdr->b_datacnt > 1) {
3632                 mutex_exit(&buf->b_evict_lock);
3633                 arc_buf_destroy(buf, FALSE, TRUE);
3634         } else {
3635                 ASSERT(buf == hdr->b_buf);
3636                 hdr->b_flags |= ARC_BUF_AVAILABLE;
3637                 mutex_exit(&buf->b_evict_lock);
3638         }
3639
3640         mutex_exit(hash_lock);
3641         VERIFY0(efunc(private));
3642         return (B_TRUE);
3643 }
3644
3645 /*
3646  * Release this buffer from the cache, making it an anonymous buffer.  This
3647  * must be done after a read and prior to modifying the buffer contents.
3648  * If the buffer has more than one reference, we must make
3649  * a new hdr for the buffer.
3650  */
3651 void
3652 arc_release(arc_buf_t *buf, void *tag)
3653 {
3654         arc_buf_hdr_t *hdr;
3655         kmutex_t *hash_lock = NULL;
3656         l2arc_buf_hdr_t *l2hdr;
3657         uint64_t buf_size = 0;
3658
3659         /*
3660          * It would be nice to assert that if it's DMU metadata (level >
3661          * 0 || it's the dnode file), then it must be syncing context.
3662          * But we don't know that information at this level.
3663          */
3664
3665         mutex_enter(&buf->b_evict_lock);
3666         hdr = buf->b_hdr;
3667
3668         /* this buffer is not on any list */
3669         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3670
3671         if (hdr->b_state == arc_anon) {
3672                 /* this buffer is already released */
3673                 ASSERT(buf->b_efunc == NULL);
3674         } else {
3675                 hash_lock = HDR_LOCK(hdr);
3676                 mutex_enter(hash_lock);
3677                 hdr = buf->b_hdr;
3678                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3679         }
3680
3681         l2hdr = hdr->b_l2hdr;
3682         if (l2hdr) {
3683                 mutex_enter(&l2arc_buflist_mtx);
3684                 arc_buf_l2_cdata_free(hdr);
3685                 hdr->b_l2hdr = NULL;
3686                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3687         }
3688         buf_size = hdr->b_size;
3689
3690         /*
3691          * Do we have more than one buf?
3692          */
3693         if (hdr->b_datacnt > 1) {
3694                 arc_buf_hdr_t *nhdr;
3695                 arc_buf_t **bufp;
3696                 uint64_t blksz = hdr->b_size;
3697                 uint64_t spa = hdr->b_spa;
3698                 arc_buf_contents_t type = hdr->b_type;
3699                 uint32_t flags = hdr->b_flags;
3700
3701                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3702                 /*
3703                  * Pull the data off of this hdr and attach it to
3704                  * a new anonymous hdr.
3705                  */
3706                 (void) remove_reference(hdr, hash_lock, tag);
3707                 bufp = &hdr->b_buf;
3708                 while (*bufp != buf)
3709                         bufp = &(*bufp)->b_next;
3710                 *bufp = buf->b_next;
3711                 buf->b_next = NULL;
3712
3713                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3714                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3715                 if (refcount_is_zero(&hdr->b_refcnt)) {
3716                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3717                         ASSERT3U(*size, >=, hdr->b_size);
3718                         atomic_add_64(size, -hdr->b_size);
3719                 }
3720
3721                 /*
3722                  * We're releasing a duplicate user data buffer, update
3723                  * our statistics accordingly.
3724                  */
3725                 if (hdr->b_type == ARC_BUFC_DATA) {
3726                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3727                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3728                             -hdr->b_size);
3729                 }
3730                 hdr->b_datacnt -= 1;
3731                 arc_cksum_verify(buf);
3732                 arc_buf_unwatch(buf);
3733
3734                 mutex_exit(hash_lock);
3735
3736                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3737                 nhdr->b_size = blksz;
3738                 nhdr->b_spa = spa;
3739                 nhdr->b_type = type;
3740                 nhdr->b_buf = buf;
3741                 nhdr->b_state = arc_anon;
3742                 nhdr->b_arc_access = 0;
3743                 nhdr->b_mru_hits = 0;
3744                 nhdr->b_mru_ghost_hits = 0;
3745                 nhdr->b_mfu_hits = 0;
3746                 nhdr->b_mfu_ghost_hits = 0;
3747                 nhdr->b_l2_hits = 0;
3748                 nhdr->b_flags = flags & ARC_L2_WRITING;
3749                 nhdr->b_l2hdr = NULL;
3750                 nhdr->b_datacnt = 1;
3751                 nhdr->b_freeze_cksum = NULL;
3752                 (void) refcount_add(&nhdr->b_refcnt, tag);
3753                 buf->b_hdr = nhdr;
3754                 mutex_exit(&buf->b_evict_lock);
3755                 atomic_add_64(&arc_anon->arcs_size, blksz);
3756         } else {
3757                 mutex_exit(&buf->b_evict_lock);
3758                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3759                 ASSERT(!list_link_active(&hdr->b_arc_node));
3760                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3761                 if (hdr->b_state != arc_anon)
3762                         arc_change_state(arc_anon, hdr, hash_lock);
3763                 hdr->b_arc_access = 0;
3764                 hdr->b_mru_hits = 0;
3765                 hdr->b_mru_ghost_hits = 0;
3766                 hdr->b_mfu_hits = 0;
3767                 hdr->b_mfu_ghost_hits = 0;
3768                 hdr->b_l2_hits = 0;
3769                 if (hash_lock)
3770                         mutex_exit(hash_lock);
3771
3772                 buf_discard_identity(hdr);
3773                 arc_buf_thaw(buf);
3774         }
3775         buf->b_efunc = NULL;
3776         buf->b_private = NULL;
3777
3778         if (l2hdr) {
3779                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3780                 vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3781                     -l2hdr->b_asize, 0, 0);
3782                 kmem_cache_free(l2arc_hdr_cache, l2hdr);
3783                 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
3784                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3785                 mutex_exit(&l2arc_buflist_mtx);
3786         }
3787 }
3788
3789 int
3790 arc_released(arc_buf_t *buf)
3791 {
3792         int released;
3793
3794         mutex_enter(&buf->b_evict_lock);
3795         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3796         mutex_exit(&buf->b_evict_lock);
3797         return (released);
3798 }
3799
3800 #ifdef ZFS_DEBUG
3801 int
3802 arc_referenced(arc_buf_t *buf)
3803 {
3804         int referenced;
3805
3806         mutex_enter(&buf->b_evict_lock);
3807         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3808         mutex_exit(&buf->b_evict_lock);
3809         return (referenced);
3810 }
3811 #endif
3812
3813 static void
3814 arc_write_ready(zio_t *zio)
3815 {
3816         arc_write_callback_t *callback = zio->io_private;
3817         arc_buf_t *buf = callback->awcb_buf;
3818         arc_buf_hdr_t *hdr = buf->b_hdr;
3819
3820         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3821         callback->awcb_ready(zio, buf, callback->awcb_private);
3822
3823         /*
3824          * If the IO is already in progress, then this is a re-write
3825          * attempt, so we need to thaw and re-compute the cksum.
3826          * It is the responsibility of the callback to handle the
3827          * accounting for any re-write attempt.
3828          */
3829         if (HDR_IO_IN_PROGRESS(hdr)) {
3830                 mutex_enter(&hdr->b_freeze_lock);
3831                 if (hdr->b_freeze_cksum != NULL) {
3832                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3833                         hdr->b_freeze_cksum = NULL;
3834                 }
3835                 mutex_exit(&hdr->b_freeze_lock);
3836         }
3837         arc_cksum_compute(buf, B_FALSE);
3838         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3839 }
3840
3841 /*
3842  * The SPA calls this callback for each physical write that happens on behalf
3843  * of a logical write.  See the comment in dbuf_write_physdone() for details.
3844  */
3845 static void
3846 arc_write_physdone(zio_t *zio)
3847 {
3848         arc_write_callback_t *cb = zio->io_private;
3849         if (cb->awcb_physdone != NULL)
3850                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3851 }
3852
3853 static void
3854 arc_write_done(zio_t *zio)
3855 {
3856         arc_write_callback_t *callback = zio->io_private;
3857         arc_buf_t *buf = callback->awcb_buf;
3858         arc_buf_hdr_t *hdr = buf->b_hdr;
3859
3860         ASSERT(hdr->b_acb == NULL);
3861
3862         if (zio->io_error == 0) {
3863                 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3864                         buf_discard_identity(hdr);
3865                 } else {
3866                         hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3867                         hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3868                         hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3869                 }
3870         } else {
3871                 ASSERT(BUF_EMPTY(hdr));
3872         }
3873
3874         /*
3875          * If the block to be written was all-zero or compressed enough to be
3876          * embedded in the BP, no write was performed so there will be no
3877          * dva/birth/checksum.  The buffer must therefore remain anonymous
3878          * (and uncached).
3879          */
3880         if (!BUF_EMPTY(hdr)) {
3881                 arc_buf_hdr_t *exists;
3882                 kmutex_t *hash_lock;
3883
3884                 ASSERT(zio->io_error == 0);
3885
3886                 arc_cksum_verify(buf);
3887
3888                 exists = buf_hash_insert(hdr, &hash_lock);
3889                 if (exists) {
3890                         /*
3891                          * This can only happen if we overwrite for
3892                          * sync-to-convergence, because we remove
3893                          * buffers from the hash table when we arc_free().
3894                          */
3895                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3896                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3897                                         panic("bad overwrite, hdr=%p exists=%p",
3898                                             (void *)hdr, (void *)exists);
3899                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3900                                 arc_change_state(arc_anon, exists, hash_lock);
3901                                 mutex_exit(hash_lock);
3902                                 arc_hdr_destroy(exists);
3903                                 exists = buf_hash_insert(hdr, &hash_lock);
3904                                 ASSERT3P(exists, ==, NULL);
3905                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3906                                 /* nopwrite */
3907                                 ASSERT(zio->io_prop.zp_nopwrite);
3908                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3909                                         panic("bad nopwrite, hdr=%p exists=%p",
3910                                             (void *)hdr, (void *)exists);
3911                         } else {
3912                                 /* Dedup */
3913                                 ASSERT(hdr->b_datacnt == 1);
3914                                 ASSERT(hdr->b_state == arc_anon);
3915                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3916                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3917                         }
3918                 }
3919                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3920                 /* if it's not anon, we are doing a scrub */
3921                 if (!exists && hdr->b_state == arc_anon)
3922                         arc_access(hdr, hash_lock);
3923                 mutex_exit(hash_lock);
3924         } else {
3925                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3926         }
3927
3928         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3929         callback->awcb_done(zio, buf, callback->awcb_private);
3930
3931         kmem_free(callback, sizeof (arc_write_callback_t));
3932 }
3933
3934 zio_t *
3935 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3936     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3937     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3938     arc_done_func_t *done, void *private, zio_priority_t priority,
3939     int zio_flags, const zbookmark_phys_t *zb)
3940 {
3941         arc_buf_hdr_t *hdr = buf->b_hdr;
3942         arc_write_callback_t *callback;
3943         zio_t *zio;
3944
3945         ASSERT(ready != NULL);
3946         ASSERT(done != NULL);
3947         ASSERT(!HDR_IO_ERROR(hdr));
3948         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3949         ASSERT(hdr->b_acb == NULL);
3950         if (l2arc)
3951                 hdr->b_flags |= ARC_L2CACHE;
3952         if (l2arc_compress)
3953                 hdr->b_flags |= ARC_L2COMPRESS;
3954         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3955         callback->awcb_ready = ready;
3956         callback->awcb_physdone = physdone;
3957         callback->awcb_done = done;
3958         callback->awcb_private = private;
3959         callback->awcb_buf = buf;
3960
3961         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3962             arc_write_ready, arc_write_physdone, arc_write_done, callback,
3963             priority, zio_flags, zb);
3964
3965         return (zio);
3966 }
3967
3968 static int
3969 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3970 {
3971 #ifdef _KERNEL
3972         if (zfs_arc_memory_throttle_disable)
3973                 return (0);
3974
3975         if (freemem <= physmem * arc_lotsfree_percent / 100) {
3976                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3977                 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
3978                 return (SET_ERROR(EAGAIN));
3979         }
3980 #endif
3981         return (0);
3982 }
3983
3984 void
3985 arc_tempreserve_clear(uint64_t reserve)
3986 {
3987         atomic_add_64(&arc_tempreserve, -reserve);
3988         ASSERT((int64_t)arc_tempreserve >= 0);
3989 }
3990
3991 int
3992 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3993 {
3994         int error;
3995         uint64_t anon_size;
3996
3997         if (reserve > arc_c/4 && !arc_no_grow)
3998                 arc_c = MIN(arc_c_max, reserve * 4);
3999
4000         /*
4001          * Throttle when the calculated memory footprint for the TXG
4002          * exceeds the target ARC size.
4003          */
4004         if (reserve > arc_c) {
4005                 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
4006                 return (SET_ERROR(ERESTART));
4007         }
4008
4009         /*
4010          * Don't count loaned bufs as in flight dirty data to prevent long
4011          * network delays from blocking transactions that are ready to be
4012          * assigned to a txg.
4013          */
4014         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4015
4016         /*
4017          * Writes will, almost always, require additional memory allocations
4018          * in order to compress/encrypt/etc the data.  We therefore need to
4019          * make sure that there is sufficient available memory for this.
4020          */
4021         error = arc_memory_throttle(reserve, txg);
4022         if (error != 0)
4023                 return (error);
4024
4025         /*
4026          * Throttle writes when the amount of dirty data in the cache
4027          * gets too large.  We try to keep the cache less than half full
4028          * of dirty blocks so that our sync times don't grow too large.
4029          * Note: if two requests come in concurrently, we might let them
4030          * both succeed, when one of them should fail.  Not a huge deal.
4031          */
4032
4033         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4034             anon_size > arc_c / 4) {
4035                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4036                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4037                     arc_tempreserve>>10,
4038                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4039                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4040                     reserve>>10, arc_c>>10);
4041                 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
4042                 return (SET_ERROR(ERESTART));
4043         }
4044         atomic_add_64(&arc_tempreserve, reserve);
4045         return (0);
4046 }
4047
4048 static void
4049 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
4050     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
4051 {
4052         size->value.ui64 = state->arcs_size;
4053         evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
4054         evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
4055 }
4056
4057 static int
4058 arc_kstat_update(kstat_t *ksp, int rw)
4059 {
4060         arc_stats_t *as = ksp->ks_data;
4061
4062         if (rw == KSTAT_WRITE) {
4063                 return (SET_ERROR(EACCES));
4064         } else {
4065                 arc_kstat_update_state(arc_anon,
4066                     &as->arcstat_anon_size,
4067                     &as->arcstat_anon_evict_data,
4068                     &as->arcstat_anon_evict_metadata);
4069                 arc_kstat_update_state(arc_mru,
4070                     &as->arcstat_mru_size,
4071                     &as->arcstat_mru_evict_data,
4072                     &as->arcstat_mru_evict_metadata);
4073                 arc_kstat_update_state(arc_mru_ghost,
4074                     &as->arcstat_mru_ghost_size,
4075                     &as->arcstat_mru_ghost_evict_data,
4076                     &as->arcstat_mru_ghost_evict_metadata);
4077                 arc_kstat_update_state(arc_mfu,
4078                     &as->arcstat_mfu_size,
4079                     &as->arcstat_mfu_evict_data,
4080                     &as->arcstat_mfu_evict_metadata);
4081                 arc_kstat_update_state(arc_mfu_ghost,
4082                     &as->arcstat_mfu_ghost_size,
4083                     &as->arcstat_mfu_ghost_evict_data,
4084                     &as->arcstat_mfu_ghost_evict_metadata);
4085         }
4086
4087         return (0);
4088 }
4089
4090 void
4091 arc_init(void)
4092 {
4093         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4094         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4095
4096         /* Convert seconds to clock ticks */
4097         zfs_arc_min_prefetch_lifespan = 1 * hz;
4098
4099         /* Start out with 1/8 of all memory */
4100         arc_c = physmem * PAGESIZE / 8;
4101
4102 #ifdef _KERNEL
4103         /*
4104          * On architectures where the physical memory can be larger
4105          * than the addressable space (intel in 32-bit mode), we may
4106          * need to limit the cache to 1/8 of VM size.
4107          */
4108         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4109         /*
4110          * Register a shrinker to support synchronous (direct) memory
4111          * reclaim from the arc.  This is done to prevent kswapd from
4112          * swapping out pages when it is preferable to shrink the arc.
4113          */
4114         spl_register_shrinker(&arc_shrinker);
4115 #endif
4116
4117         /* set min cache to zero */
4118         arc_c_min = 4<<20;
4119         /* set max to 1/2 of all memory */
4120         arc_c_max = arc_c * 4;
4121
4122         /*
4123          * Allow the tunables to override our calculations if they are
4124          * reasonable (ie. over 64MB)
4125          */
4126         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
4127                 arc_c_max = zfs_arc_max;
4128         if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max)
4129                 arc_c_min = zfs_arc_min;
4130
4131         arc_c = arc_c_max;
4132         arc_p = (arc_c >> 1);
4133
4134         /* limit meta-data to 3/4 of the arc capacity */
4135         arc_meta_limit = (3 * arc_c_max) / 4;
4136         arc_meta_max = 0;
4137
4138         /* Allow the tunable to override if it is reasonable */
4139         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4140                 arc_meta_limit = zfs_arc_meta_limit;
4141
4142         /* if kmem_flags are set, lets try to use less memory */
4143         if (kmem_debugging())
4144                 arc_c = arc_c / 2;
4145         if (arc_c < arc_c_min)
4146                 arc_c = arc_c_min;
4147
4148         arc_anon = &ARC_anon;
4149         arc_mru = &ARC_mru;
4150         arc_mru_ghost = &ARC_mru_ghost;
4151         arc_mfu = &ARC_mfu;
4152         arc_mfu_ghost = &ARC_mfu_ghost;
4153         arc_l2c_only = &ARC_l2c_only;
4154         arc_size = 0;
4155
4156         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4157         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4158         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4159         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4160         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4161         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4162
4163         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
4164             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4165         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
4166             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4167         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
4168             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4169         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
4170             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4171         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
4172             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4173         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
4174             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4175         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
4176             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4177         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
4178             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4179         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
4180             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4181         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
4182             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4183
4184         arc_anon->arcs_state = ARC_STATE_ANON;
4185         arc_mru->arcs_state = ARC_STATE_MRU;
4186         arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
4187         arc_mfu->arcs_state = ARC_STATE_MFU;
4188         arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
4189         arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
4190
4191         buf_init();
4192
4193         arc_thread_exit = 0;
4194         list_create(&arc_prune_list, sizeof (arc_prune_t),
4195             offsetof(arc_prune_t, p_node));
4196         arc_eviction_list = NULL;
4197         mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
4198         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4199         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4200
4201         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4202             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4203
4204         if (arc_ksp != NULL) {
4205                 arc_ksp->ks_data = &arc_stats;
4206                 arc_ksp->ks_update = arc_kstat_update;
4207                 kstat_install(arc_ksp);
4208         }
4209
4210         (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
4211             TS_RUN, minclsyspri);
4212
4213         arc_dead = FALSE;
4214         arc_warm = B_FALSE;
4215
4216         /*
4217          * Calculate maximum amount of dirty data per pool.
4218          *
4219          * If it has been set by a module parameter, take that.
4220          * Otherwise, use a percentage of physical memory defined by
4221          * zfs_dirty_data_max_percent (default 10%) with a cap at
4222          * zfs_dirty_data_max_max (default 25% of physical memory).
4223          */
4224         if (zfs_dirty_data_max_max == 0)
4225                 zfs_dirty_data_max_max = physmem * PAGESIZE *
4226                     zfs_dirty_data_max_max_percent / 100;
4227
4228         if (zfs_dirty_data_max == 0) {
4229                 zfs_dirty_data_max = physmem * PAGESIZE *
4230                     zfs_dirty_data_max_percent / 100;
4231                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4232                     zfs_dirty_data_max_max);
4233         }
4234 }
4235
4236 void
4237 arc_fini(void)
4238 {
4239         arc_prune_t *p;
4240
4241         mutex_enter(&arc_reclaim_thr_lock);
4242 #ifdef _KERNEL
4243         spl_unregister_shrinker(&arc_shrinker);
4244 #endif /* _KERNEL */
4245
4246         arc_thread_exit = 1;
4247         while (arc_thread_exit != 0)
4248                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4249         mutex_exit(&arc_reclaim_thr_lock);
4250
4251         arc_flush(NULL);
4252
4253         arc_dead = TRUE;
4254
4255         if (arc_ksp != NULL) {
4256                 kstat_delete(arc_ksp);
4257                 arc_ksp = NULL;
4258         }
4259
4260         mutex_enter(&arc_prune_mtx);
4261         while ((p = list_head(&arc_prune_list)) != NULL) {
4262                 list_remove(&arc_prune_list, p);
4263                 refcount_remove(&p->p_refcnt, &arc_prune_list);
4264                 refcount_destroy(&p->p_refcnt);
4265                 kmem_free(p, sizeof (*p));
4266         }
4267         mutex_exit(&arc_prune_mtx);
4268
4269         list_destroy(&arc_prune_list);
4270         mutex_destroy(&arc_prune_mtx);
4271         mutex_destroy(&arc_eviction_mtx);
4272         mutex_destroy(&arc_reclaim_thr_lock);
4273         cv_destroy(&arc_reclaim_thr_cv);
4274
4275         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4276         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4277         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4278         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4279         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4280         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4281         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
4282         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
4283
4284         mutex_destroy(&arc_anon->arcs_mtx);
4285         mutex_destroy(&arc_mru->arcs_mtx);
4286         mutex_destroy(&arc_mru_ghost->arcs_mtx);
4287         mutex_destroy(&arc_mfu->arcs_mtx);
4288         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4289         mutex_destroy(&arc_l2c_only->arcs_mtx);
4290
4291         buf_fini();
4292
4293         ASSERT(arc_loaned_bytes == 0);
4294 }
4295
4296 /*
4297  * Level 2 ARC
4298  *
4299  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4300  * It uses dedicated storage devices to hold cached data, which are populated
4301  * using large infrequent writes.  The main role of this cache is to boost
4302  * the performance of random read workloads.  The intended L2ARC devices
4303  * include short-stroked disks, solid state disks, and other media with
4304  * substantially faster read latency than disk.
4305  *
4306  *                 +-----------------------+
4307  *                 |         ARC           |
4308  *                 +-----------------------+
4309  *                    |         ^     ^
4310  *                    |         |     |
4311  *      l2arc_feed_thread()    arc_read()
4312  *                    |         |     |
4313  *                    |  l2arc read   |
4314  *                    V         |     |
4315  *               +---------------+    |
4316  *               |     L2ARC     |    |
4317  *               +---------------+    |
4318  *                   |    ^           |
4319  *          l2arc_write() |           |
4320  *                   |    |           |
4321  *                   V    |           |
4322  *                 +-------+      +-------+
4323  *                 | vdev  |      | vdev  |
4324  *                 | cache |      | cache |
4325  *                 +-------+      +-------+
4326  *                 +=========+     .-----.
4327  *                 :  L2ARC  :    |-_____-|
4328  *                 : devices :    | Disks |
4329  *                 +=========+    `-_____-'
4330  *
4331  * Read requests are satisfied from the following sources, in order:
4332  *
4333  *      1) ARC
4334  *      2) vdev cache of L2ARC devices
4335  *      3) L2ARC devices
4336  *      4) vdev cache of disks
4337  *      5) disks
4338  *
4339  * Some L2ARC device types exhibit extremely slow write performance.
4340  * To accommodate for this there are some significant differences between
4341  * the L2ARC and traditional cache design:
4342  *
4343  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4344  * the ARC behave as usual, freeing buffers and placing headers on ghost
4345  * lists.  The ARC does not send buffers to the L2ARC during eviction as
4346  * this would add inflated write latencies for all ARC memory pressure.
4347  *
4348  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4349  * It does this by periodically scanning buffers from the eviction-end of
4350  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4351  * not already there. It scans until a headroom of buffers is satisfied,
4352  * which itself is a buffer for ARC eviction. If a compressible buffer is
4353  * found during scanning and selected for writing to an L2ARC device, we
4354  * temporarily boost scanning headroom during the next scan cycle to make
4355  * sure we adapt to compression effects (which might significantly reduce
4356  * the data volume we write to L2ARC). The thread that does this is
4357  * l2arc_feed_thread(), illustrated below; example sizes are included to
4358  * provide a better sense of ratio than this diagram:
4359  *
4360  *             head -->                        tail
4361  *              +---------------------+----------+
4362  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4363  *              +---------------------+----------+   |   o L2ARC eligible
4364  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4365  *              +---------------------+----------+   |
4366  *                   15.9 Gbytes      ^ 32 Mbytes    |
4367  *                                 headroom          |
4368  *                                            l2arc_feed_thread()
4369  *                                                   |
4370  *                       l2arc write hand <--[oooo]--'
4371  *                               |           8 Mbyte
4372  *                               |          write max
4373  *                               V
4374  *                +==============================+
4375  *      L2ARC dev |####|#|###|###|    |####| ... |
4376  *                +==============================+
4377  *                           32 Gbytes
4378  *
4379  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4380  * evicted, then the L2ARC has cached a buffer much sooner than it probably
4381  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4382  * safe to say that this is an uncommon case, since buffers at the end of
4383  * the ARC lists have moved there due to inactivity.
4384  *
4385  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4386  * then the L2ARC simply misses copying some buffers.  This serves as a
4387  * pressure valve to prevent heavy read workloads from both stalling the ARC
4388  * with waits and clogging the L2ARC with writes.  This also helps prevent
4389  * the potential for the L2ARC to churn if it attempts to cache content too
4390  * quickly, such as during backups of the entire pool.
4391  *
4392  * 5. After system boot and before the ARC has filled main memory, there are
4393  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4394  * lists can remain mostly static.  Instead of searching from tail of these
4395  * lists as pictured, the l2arc_feed_thread() will search from the list heads
4396  * for eligible buffers, greatly increasing its chance of finding them.
4397  *
4398  * The L2ARC device write speed is also boosted during this time so that
4399  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4400  * there are no L2ARC reads, and no fear of degrading read performance
4401  * through increased writes.
4402  *
4403  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4404  * the vdev queue can aggregate them into larger and fewer writes.  Each
4405  * device is written to in a rotor fashion, sweeping writes through
4406  * available space then repeating.
4407  *
4408  * 7. The L2ARC does not store dirty content.  It never needs to flush
4409  * write buffers back to disk based storage.
4410  *
4411  * 8. If an ARC buffer is written (and dirtied) which also exists in the
4412  * L2ARC, the now stale L2ARC buffer is immediately dropped.
4413  *
4414  * The performance of the L2ARC can be tweaked by a number of tunables, which
4415  * may be necessary for different workloads:
4416  *
4417  *      l2arc_write_max         max write bytes per interval
4418  *      l2arc_write_boost       extra write bytes during device warmup
4419  *      l2arc_noprefetch        skip caching prefetched buffers
4420  *      l2arc_nocompress        skip compressing buffers
4421  *      l2arc_headroom          number of max device writes to precache
4422  *      l2arc_headroom_boost    when we find compressed buffers during ARC
4423  *                              scanning, we multiply headroom by this
4424  *                              percentage factor for the next scan cycle,
4425  *                              since more compressed buffers are likely to
4426  *                              be present
4427  *      l2arc_feed_secs         seconds between L2ARC writing
4428  *
4429  * Tunables may be removed or added as future performance improvements are
4430  * integrated, and also may become zpool properties.
4431  *
4432  * There are three key functions that control how the L2ARC warms up:
4433  *
4434  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4435  *      l2arc_write_size()      calculate how much to write
4436  *      l2arc_write_interval()  calculate sleep delay between writes
4437  *
4438  * These three functions determine what to write, how much, and how quickly
4439  * to send writes.
4440  */
4441
4442 static boolean_t
4443 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4444 {
4445         /*
4446          * A buffer is *not* eligible for the L2ARC if it:
4447          * 1. belongs to a different spa.
4448          * 2. is already cached on the L2ARC.
4449          * 3. has an I/O in progress (it may be an incomplete read).
4450          * 4. is flagged not eligible (zfs property).
4451          */
4452         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4453             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4454                 return (B_FALSE);
4455
4456         return (B_TRUE);
4457 }
4458
4459 static uint64_t
4460 l2arc_write_size(void)
4461 {
4462         uint64_t size;
4463
4464         /*
4465          * Make sure our globals have meaningful values in case the user
4466          * altered them.
4467          */
4468         size = l2arc_write_max;
4469         if (size == 0) {
4470                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4471                     "be greater than zero, resetting it to the default (%d)",
4472                     L2ARC_WRITE_SIZE);
4473                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4474         }
4475
4476         if (arc_warm == B_FALSE)
4477                 size += l2arc_write_boost;
4478
4479         return (size);
4480
4481 }
4482
4483 static clock_t
4484 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4485 {
4486         clock_t interval, next, now;
4487
4488         /*
4489          * If the ARC lists are busy, increase our write rate; if the
4490          * lists are stale, idle back.  This is achieved by checking
4491          * how much we previously wrote - if it was more than half of
4492          * what we wanted, schedule the next write much sooner.
4493          */
4494         if (l2arc_feed_again && wrote > (wanted / 2))
4495                 interval = (hz * l2arc_feed_min_ms) / 1000;
4496         else
4497                 interval = hz * l2arc_feed_secs;
4498
4499         now = ddi_get_lbolt();
4500         next = MAX(now, MIN(now + interval, began + interval));
4501
4502         return (next);
4503 }
4504
4505 static void
4506 l2arc_hdr_stat_add(void)
4507 {
4508         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
4509         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4510 }
4511
4512 static void
4513 l2arc_hdr_stat_remove(void)
4514 {
4515         ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
4516         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4517 }
4518
4519 /*
4520  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4521  * If a device is returned, this also returns holding the spa config lock.
4522  */
4523 static l2arc_dev_t *
4524 l2arc_dev_get_next(void)
4525 {
4526         l2arc_dev_t *first, *next = NULL;
4527
4528         /*
4529          * Lock out the removal of spas (spa_namespace_lock), then removal
4530          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4531          * both locks will be dropped and a spa config lock held instead.
4532          */
4533         mutex_enter(&spa_namespace_lock);
4534         mutex_enter(&l2arc_dev_mtx);
4535
4536         /* if there are no vdevs, there is nothing to do */
4537         if (l2arc_ndev == 0)
4538                 goto out;
4539
4540         first = NULL;
4541         next = l2arc_dev_last;
4542         do {
4543                 /* loop around the list looking for a non-faulted vdev */
4544                 if (next == NULL) {
4545                         next = list_head(l2arc_dev_list);
4546                 } else {
4547                         next = list_next(l2arc_dev_list, next);
4548                         if (next == NULL)
4549                                 next = list_head(l2arc_dev_list);
4550                 }
4551
4552                 /* if we have come back to the start, bail out */
4553                 if (first == NULL)
4554                         first = next;
4555                 else if (next == first)
4556                         break;
4557
4558         } while (vdev_is_dead(next->l2ad_vdev));
4559
4560         /* if we were unable to find any usable vdevs, return NULL */
4561         if (vdev_is_dead(next->l2ad_vdev))
4562                 next = NULL;
4563
4564         l2arc_dev_last = next;
4565
4566 out:
4567         mutex_exit(&l2arc_dev_mtx);
4568
4569         /*
4570          * Grab the config lock to prevent the 'next' device from being
4571          * removed while we are writing to it.
4572          */
4573         if (next != NULL)
4574                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4575         mutex_exit(&spa_namespace_lock);
4576
4577         return (next);
4578 }
4579
4580 /*
4581  * Free buffers that were tagged for destruction.
4582  */
4583 static void
4584 l2arc_do_free_on_write(void)
4585 {
4586         list_t *buflist;
4587         l2arc_data_free_t *df, *df_prev;
4588
4589         mutex_enter(&l2arc_free_on_write_mtx);
4590         buflist = l2arc_free_on_write;
4591
4592         for (df = list_tail(buflist); df; df = df_prev) {
4593                 df_prev = list_prev(buflist, df);
4594                 ASSERT(df->l2df_data != NULL);
4595                 ASSERT(df->l2df_func != NULL);
4596                 df->l2df_func(df->l2df_data, df->l2df_size);
4597                 list_remove(buflist, df);
4598                 kmem_free(df, sizeof (l2arc_data_free_t));
4599         }
4600
4601         mutex_exit(&l2arc_free_on_write_mtx);
4602 }
4603
4604 /*
4605  * A write to a cache device has completed.  Update all headers to allow
4606  * reads from these buffers to begin.
4607  */
4608 static void
4609 l2arc_write_done(zio_t *zio)
4610 {
4611         l2arc_write_callback_t *cb;
4612         l2arc_dev_t *dev;
4613         list_t *buflist;
4614         arc_buf_hdr_t *head, *ab, *ab_prev;
4615         l2arc_buf_hdr_t *abl2;
4616         kmutex_t *hash_lock;
4617         int64_t bytes_dropped = 0;
4618
4619         cb = zio->io_private;
4620         ASSERT(cb != NULL);
4621         dev = cb->l2wcb_dev;
4622         ASSERT(dev != NULL);
4623         head = cb->l2wcb_head;
4624         ASSERT(head != NULL);
4625         buflist = dev->l2ad_buflist;
4626         ASSERT(buflist != NULL);
4627         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4628             l2arc_write_callback_t *, cb);
4629
4630         if (zio->io_error != 0)
4631                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4632
4633         mutex_enter(&l2arc_buflist_mtx);
4634
4635         /*
4636          * All writes completed, or an error was hit.
4637          */
4638         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4639                 ab_prev = list_prev(buflist, ab);
4640                 abl2 = ab->b_l2hdr;
4641
4642                 /*
4643                  * Release the temporary compressed buffer as soon as possible.
4644                  */
4645                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4646                         l2arc_release_cdata_buf(ab);
4647
4648                 hash_lock = HDR_LOCK(ab);
4649                 if (!mutex_tryenter(hash_lock)) {
4650                         /*
4651                          * This buffer misses out.  It may be in a stage
4652                          * of eviction.  Its ARC_L2_WRITING flag will be
4653                          * left set, denying reads to this buffer.
4654                          */
4655                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4656                         continue;
4657                 }
4658
4659                 if (zio->io_error != 0) {
4660                         /*
4661                          * Error - drop L2ARC entry.
4662                          */
4663                         list_remove(buflist, ab);
4664                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4665                         bytes_dropped += abl2->b_asize;
4666                         ab->b_l2hdr = NULL;
4667                         kmem_cache_free(l2arc_hdr_cache, abl2);
4668                         arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4669                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4670                 }
4671
4672                 /*
4673                  * Allow ARC to begin reads to this L2ARC entry.
4674                  */
4675                 ab->b_flags &= ~ARC_L2_WRITING;
4676
4677                 mutex_exit(hash_lock);
4678         }
4679
4680         atomic_inc_64(&l2arc_writes_done);
4681         list_remove(buflist, head);
4682         kmem_cache_free(hdr_cache, head);
4683         mutex_exit(&l2arc_buflist_mtx);
4684
4685         vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4686
4687         l2arc_do_free_on_write();
4688
4689         kmem_free(cb, sizeof (l2arc_write_callback_t));
4690 }
4691
4692 /*
4693  * A read to a cache device completed.  Validate buffer contents before
4694  * handing over to the regular ARC routines.
4695  */
4696 static void
4697 l2arc_read_done(zio_t *zio)
4698 {
4699         l2arc_read_callback_t *cb;
4700         arc_buf_hdr_t *hdr;
4701         arc_buf_t *buf;
4702         kmutex_t *hash_lock;
4703         int equal;
4704
4705         ASSERT(zio->io_vd != NULL);
4706         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4707
4708         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4709
4710         cb = zio->io_private;
4711         ASSERT(cb != NULL);
4712         buf = cb->l2rcb_buf;
4713         ASSERT(buf != NULL);
4714
4715         hash_lock = HDR_LOCK(buf->b_hdr);
4716         mutex_enter(hash_lock);
4717         hdr = buf->b_hdr;
4718         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4719
4720         /*
4721          * If the buffer was compressed, decompress it first.
4722          */
4723         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4724                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4725         ASSERT(zio->io_data != NULL);
4726
4727         /*
4728          * Check this survived the L2ARC journey.
4729          */
4730         equal = arc_cksum_equal(buf);
4731         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4732                 mutex_exit(hash_lock);
4733                 zio->io_private = buf;
4734                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4735                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
4736                 arc_read_done(zio);
4737         } else {
4738                 mutex_exit(hash_lock);
4739                 /*
4740                  * Buffer didn't survive caching.  Increment stats and
4741                  * reissue to the original storage device.
4742                  */
4743                 if (zio->io_error != 0) {
4744                         ARCSTAT_BUMP(arcstat_l2_io_error);
4745                 } else {
4746                         zio->io_error = SET_ERROR(EIO);
4747                 }
4748                 if (!equal)
4749                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4750
4751                 /*
4752                  * If there's no waiter, issue an async i/o to the primary
4753                  * storage now.  If there *is* a waiter, the caller must
4754                  * issue the i/o in a context where it's OK to block.
4755                  */
4756                 if (zio->io_waiter == NULL) {
4757                         zio_t *pio = zio_unique_parent(zio);
4758
4759                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4760
4761                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4762                             buf->b_data, zio->io_size, arc_read_done, buf,
4763                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4764                 }
4765         }
4766
4767         kmem_free(cb, sizeof (l2arc_read_callback_t));
4768 }
4769
4770 /*
4771  * This is the list priority from which the L2ARC will search for pages to
4772  * cache.  This is used within loops (0..3) to cycle through lists in the
4773  * desired order.  This order can have a significant effect on cache
4774  * performance.
4775  *
4776  * Currently the metadata lists are hit first, MFU then MRU, followed by
4777  * the data lists.  This function returns a locked list, and also returns
4778  * the lock pointer.
4779  */
4780 static list_t *
4781 l2arc_list_locked(int list_num, kmutex_t **lock)
4782 {
4783         list_t *list = NULL;
4784
4785         ASSERT(list_num >= 0 && list_num <= 3);
4786
4787         switch (list_num) {
4788         case 0:
4789                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4790                 *lock = &arc_mfu->arcs_mtx;
4791                 break;
4792         case 1:
4793                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4794                 *lock = &arc_mru->arcs_mtx;
4795                 break;
4796         case 2:
4797                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4798                 *lock = &arc_mfu->arcs_mtx;
4799                 break;
4800         case 3:
4801                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4802                 *lock = &arc_mru->arcs_mtx;
4803                 break;
4804         }
4805
4806         ASSERT(!(MUTEX_HELD(*lock)));
4807         mutex_enter(*lock);
4808         return (list);
4809 }
4810
4811 /*
4812  * Evict buffers from the device write hand to the distance specified in
4813  * bytes.  This distance may span populated buffers, it may span nothing.
4814  * This is clearing a region on the L2ARC device ready for writing.
4815  * If the 'all' boolean is set, every buffer is evicted.
4816  */
4817 static void
4818 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4819 {
4820         list_t *buflist;
4821         l2arc_buf_hdr_t *abl2;
4822         arc_buf_hdr_t *ab, *ab_prev;
4823         kmutex_t *hash_lock;
4824         uint64_t taddr;
4825         int64_t bytes_evicted = 0;
4826
4827         buflist = dev->l2ad_buflist;
4828
4829         if (buflist == NULL)
4830                 return;
4831
4832         if (!all && dev->l2ad_first) {
4833                 /*
4834                  * This is the first sweep through the device.  There is
4835                  * nothing to evict.
4836                  */
4837                 return;
4838         }
4839
4840         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4841                 /*
4842                  * When nearing the end of the device, evict to the end
4843                  * before the device write hand jumps to the start.
4844                  */
4845                 taddr = dev->l2ad_end;
4846         } else {
4847                 taddr = dev->l2ad_hand + distance;
4848         }
4849         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4850             uint64_t, taddr, boolean_t, all);
4851
4852 top:
4853         mutex_enter(&l2arc_buflist_mtx);
4854         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4855                 ab_prev = list_prev(buflist, ab);
4856
4857                 hash_lock = HDR_LOCK(ab);
4858                 if (!mutex_tryenter(hash_lock)) {
4859                         /*
4860                          * Missed the hash lock.  Retry.
4861                          */
4862                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4863                         mutex_exit(&l2arc_buflist_mtx);
4864                         mutex_enter(hash_lock);
4865                         mutex_exit(hash_lock);
4866                         goto top;
4867                 }
4868
4869                 if (HDR_L2_WRITE_HEAD(ab)) {
4870                         /*
4871                          * We hit a write head node.  Leave it for
4872                          * l2arc_write_done().
4873                          */
4874                         list_remove(buflist, ab);
4875                         mutex_exit(hash_lock);
4876                         continue;
4877                 }
4878
4879                 if (!all && ab->b_l2hdr != NULL &&
4880                     (ab->b_l2hdr->b_daddr > taddr ||
4881                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4882                         /*
4883                          * We've evicted to the target address,
4884                          * or the end of the device.
4885                          */
4886                         mutex_exit(hash_lock);
4887                         break;
4888                 }
4889
4890                 if (HDR_FREE_IN_PROGRESS(ab)) {
4891                         /*
4892                          * Already on the path to destruction.
4893                          */
4894                         mutex_exit(hash_lock);
4895                         continue;
4896                 }
4897
4898                 if (ab->b_state == arc_l2c_only) {
4899                         ASSERT(!HDR_L2_READING(ab));
4900                         /*
4901                          * This doesn't exist in the ARC.  Destroy.
4902                          * arc_hdr_destroy() will call list_remove()
4903                          * and decrement arcstat_l2_size.
4904                          */
4905                         arc_change_state(arc_anon, ab, hash_lock);
4906                         arc_hdr_destroy(ab);
4907                 } else {
4908                         /*
4909                          * Invalidate issued or about to be issued
4910                          * reads, since we may be about to write
4911                          * over this location.
4912                          */
4913                         if (HDR_L2_READING(ab)) {
4914                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4915                                 ab->b_flags |= ARC_L2_EVICTED;
4916                         }
4917
4918                         /*
4919                          * Tell ARC this no longer exists in L2ARC.
4920                          */
4921                         if (ab->b_l2hdr != NULL) {
4922                                 abl2 = ab->b_l2hdr;
4923                                 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4924                                 bytes_evicted += abl2->b_asize;
4925                                 ab->b_l2hdr = NULL;
4926                                 /*
4927                                  * We are destroying l2hdr, so ensure that
4928                                  * its compressed buffer, if any, is not leaked.
4929                                  */
4930                                 ASSERT(abl2->b_tmp_cdata == NULL);
4931                                 kmem_cache_free(l2arc_hdr_cache, abl2);
4932                                 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4933                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4934                         }
4935                         list_remove(buflist, ab);
4936
4937                         /*
4938                          * This may have been leftover after a
4939                          * failed write.
4940                          */
4941                         ab->b_flags &= ~ARC_L2_WRITING;
4942                 }
4943                 mutex_exit(hash_lock);
4944         }
4945         mutex_exit(&l2arc_buflist_mtx);
4946
4947         vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
4948         dev->l2ad_evict = taddr;
4949 }
4950
4951 /*
4952  * Find and write ARC buffers to the L2ARC device.
4953  *
4954  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4955  * for reading until they have completed writing.
4956  * The headroom_boost is an in-out parameter used to maintain headroom boost
4957  * state between calls to this function.
4958  *
4959  * Returns the number of bytes actually written (which may be smaller than
4960  * the delta by which the device hand has changed due to alignment).
4961  */
4962 static uint64_t
4963 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4964     boolean_t *headroom_boost)
4965 {
4966         arc_buf_hdr_t *ab, *ab_prev, *head;
4967         list_t *list;
4968         uint64_t write_asize, write_psize, write_sz, headroom,
4969             buf_compress_minsz;
4970         void *buf_data;
4971         kmutex_t *list_lock = NULL;
4972         boolean_t full;
4973         l2arc_write_callback_t *cb;
4974         zio_t *pio, *wzio;
4975         uint64_t guid = spa_load_guid(spa);
4976         int try;
4977         const boolean_t do_headroom_boost = *headroom_boost;
4978
4979         ASSERT(dev->l2ad_vdev != NULL);
4980
4981         /* Lower the flag now, we might want to raise it again later. */
4982         *headroom_boost = B_FALSE;
4983
4984         pio = NULL;
4985         write_sz = write_asize = write_psize = 0;
4986         full = B_FALSE;
4987         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4988         head->b_flags |= ARC_L2_WRITE_HEAD;
4989
4990         /*
4991          * We will want to try to compress buffers that are at least 2x the
4992          * device sector size.
4993          */
4994         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4995
4996         /*
4997          * Copy buffers for L2ARC writing.
4998          */
4999         mutex_enter(&l2arc_buflist_mtx);
5000         for (try = 0; try <= 3; try++) {
5001                 uint64_t passed_sz = 0;
5002
5003                 list = l2arc_list_locked(try, &list_lock);
5004
5005                 /*
5006                  * L2ARC fast warmup.
5007                  *
5008                  * Until the ARC is warm and starts to evict, read from the
5009                  * head of the ARC lists rather than the tail.
5010                  */
5011                 if (arc_warm == B_FALSE)
5012                         ab = list_head(list);
5013                 else
5014                         ab = list_tail(list);
5015
5016                 headroom = target_sz * l2arc_headroom;
5017                 if (do_headroom_boost)
5018                         headroom = (headroom * l2arc_headroom_boost) / 100;
5019
5020                 for (; ab; ab = ab_prev) {
5021                         l2arc_buf_hdr_t *l2hdr;
5022                         kmutex_t *hash_lock;
5023                         uint64_t buf_sz;
5024
5025                         if (arc_warm == B_FALSE)
5026                                 ab_prev = list_next(list, ab);
5027                         else
5028                                 ab_prev = list_prev(list, ab);
5029
5030                         hash_lock = HDR_LOCK(ab);
5031                         if (!mutex_tryenter(hash_lock)) {
5032                                 /*
5033                                  * Skip this buffer rather than waiting.
5034                                  */
5035                                 continue;
5036                         }
5037
5038                         passed_sz += ab->b_size;
5039                         if (passed_sz > headroom) {
5040                                 /*
5041                                  * Searched too far.
5042                                  */
5043                                 mutex_exit(hash_lock);
5044                                 break;
5045                         }
5046
5047                         if (!l2arc_write_eligible(guid, ab)) {
5048                                 mutex_exit(hash_lock);
5049                                 continue;
5050                         }
5051
5052                         if ((write_sz + ab->b_size) > target_sz) {
5053                                 full = B_TRUE;
5054                                 mutex_exit(hash_lock);
5055                                 break;
5056                         }
5057
5058                         if (pio == NULL) {
5059                                 /*
5060                                  * Insert a dummy header on the buflist so
5061                                  * l2arc_write_done() can find where the
5062                                  * write buffers begin without searching.
5063                                  */
5064                                 list_insert_head(dev->l2ad_buflist, head);
5065
5066                                 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
5067                                     KM_SLEEP);
5068                                 cb->l2wcb_dev = dev;
5069                                 cb->l2wcb_head = head;
5070                                 pio = zio_root(spa, l2arc_write_done, cb,
5071                                     ZIO_FLAG_CANFAIL);
5072                         }
5073
5074                         /*
5075                          * Create and add a new L2ARC header.
5076                          */
5077                         l2hdr = kmem_cache_alloc(l2arc_hdr_cache, KM_SLEEP);
5078                         l2hdr->b_dev = dev;
5079                         l2hdr->b_daddr = 0;
5080                         arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
5081
5082                         ab->b_flags |= ARC_L2_WRITING;
5083
5084                         /*
5085                          * Temporarily stash the data buffer in b_tmp_cdata.
5086                          * The subsequent write step will pick it up from
5087                          * there. This is because can't access ab->b_buf
5088                          * without holding the hash_lock, which we in turn
5089                          * can't access without holding the ARC list locks
5090                          * (which we want to avoid during compression/writing)
5091                          */
5092                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
5093                         l2hdr->b_asize = ab->b_size;
5094                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5095                         l2hdr->b_hits = 0;
5096
5097                         buf_sz = ab->b_size;
5098                         ab->b_l2hdr = l2hdr;
5099
5100                         list_insert_head(dev->l2ad_buflist, ab);
5101
5102                         /*
5103                          * Compute and store the buffer cksum before
5104                          * writing.  On debug the cksum is verified first.
5105                          */
5106                         arc_cksum_verify(ab->b_buf);
5107                         arc_cksum_compute(ab->b_buf, B_TRUE);
5108
5109                         mutex_exit(hash_lock);
5110
5111                         write_sz += buf_sz;
5112                 }
5113
5114                 mutex_exit(list_lock);
5115
5116                 if (full == B_TRUE)
5117                         break;
5118         }
5119
5120         /* No buffers selected for writing? */
5121         if (pio == NULL) {
5122                 ASSERT0(write_sz);
5123                 mutex_exit(&l2arc_buflist_mtx);
5124                 kmem_cache_free(hdr_cache, head);
5125                 return (0);
5126         }
5127
5128         /*
5129          * Now start writing the buffers. We're starting at the write head
5130          * and work backwards, retracing the course of the buffer selector
5131          * loop above.
5132          */
5133         for (ab = list_prev(dev->l2ad_buflist, head); ab;
5134             ab = list_prev(dev->l2ad_buflist, ab)) {
5135                 l2arc_buf_hdr_t *l2hdr;
5136                 uint64_t buf_sz;
5137
5138                 /*
5139                  * We shouldn't need to lock the buffer here, since we flagged
5140                  * it as ARC_L2_WRITING in the previous step, but we must take
5141                  * care to only access its L2 cache parameters. In particular,
5142                  * ab->b_buf may be invalid by now due to ARC eviction.
5143                  */
5144                 l2hdr = ab->b_l2hdr;
5145                 l2hdr->b_daddr = dev->l2ad_hand;
5146
5147                 if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
5148                     l2hdr->b_asize >= buf_compress_minsz) {
5149                         if (l2arc_compress_buf(l2hdr)) {
5150                                 /*
5151                                  * If compression succeeded, enable headroom
5152                                  * boost on the next scan cycle.
5153                                  */
5154                                 *headroom_boost = B_TRUE;
5155                         }
5156                 }
5157
5158                 /*
5159                  * Pick up the buffer data we had previously stashed away
5160                  * (and now potentially also compressed).
5161                  */
5162                 buf_data = l2hdr->b_tmp_cdata;
5163                 buf_sz = l2hdr->b_asize;
5164
5165                 /*
5166                  * If the data has not been compressed, then clear b_tmp_cdata
5167                  * to make sure that it points only to a temporary compression
5168                  * buffer.
5169                  */
5170                 if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
5171                         l2hdr->b_tmp_cdata = NULL;
5172
5173                 /* Compression may have squashed the buffer to zero length. */
5174                 if (buf_sz != 0) {
5175                         uint64_t buf_p_sz;
5176
5177                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
5178                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5179                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5180                             ZIO_FLAG_CANFAIL, B_FALSE);
5181
5182                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5183                             zio_t *, wzio);
5184                         (void) zio_nowait(wzio);
5185
5186                         write_asize += buf_sz;
5187                         /*
5188                          * Keep the clock hand suitably device-aligned.
5189                          */
5190                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5191                         write_psize += buf_p_sz;
5192                         dev->l2ad_hand += buf_p_sz;
5193                 }
5194         }
5195
5196         mutex_exit(&l2arc_buflist_mtx);
5197
5198         ASSERT3U(write_asize, <=, target_sz);
5199         ARCSTAT_BUMP(arcstat_l2_writes_sent);
5200         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5201         ARCSTAT_INCR(arcstat_l2_size, write_sz);
5202         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5203         vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
5204
5205         /*
5206          * Bump device hand to the device start if it is approaching the end.
5207          * l2arc_evict() will already have evicted ahead for this case.
5208          */
5209         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5210                 dev->l2ad_hand = dev->l2ad_start;
5211                 dev->l2ad_evict = dev->l2ad_start;
5212                 dev->l2ad_first = B_FALSE;
5213         }
5214
5215         dev->l2ad_writing = B_TRUE;
5216         (void) zio_wait(pio);
5217         dev->l2ad_writing = B_FALSE;
5218
5219         return (write_asize);
5220 }
5221
5222 /*
5223  * Compresses an L2ARC buffer.
5224  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5225  * size in l2hdr->b_asize. This routine tries to compress the data and
5226  * depending on the compression result there are three possible outcomes:
5227  * *) The buffer was incompressible. The original l2hdr contents were left
5228  *    untouched and are ready for writing to an L2 device.
5229  * *) The buffer was all-zeros, so there is no need to write it to an L2
5230  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5231  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5232  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5233  *    data buffer which holds the compressed data to be written, and b_asize
5234  *    tells us how much data there is. b_compress is set to the appropriate
5235  *    compression algorithm. Once writing is done, invoke
5236  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5237  *
5238  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5239  * buffer was incompressible).
5240  */
5241 static boolean_t
5242 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5243 {
5244         void *cdata;
5245         size_t csize, len, rounded;
5246
5247         ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5248         ASSERT(l2hdr->b_tmp_cdata != NULL);
5249
5250         len = l2hdr->b_asize;
5251         cdata = zio_data_buf_alloc(len);
5252         csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5253             cdata, l2hdr->b_asize);
5254
5255         rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
5256         if (rounded > csize) {
5257                 bzero((char *)cdata + csize, rounded - csize);
5258                 csize = rounded;
5259         }
5260
5261         if (csize == 0) {
5262                 /* zero block, indicate that there's nothing to write */
5263                 zio_data_buf_free(cdata, len);
5264                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5265                 l2hdr->b_asize = 0;
5266                 l2hdr->b_tmp_cdata = NULL;
5267                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5268                 return (B_TRUE);
5269         } else if (csize > 0 && csize < len) {
5270                 /*
5271                  * Compression succeeded, we'll keep the cdata around for
5272                  * writing and release it afterwards.
5273                  */
5274                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5275                 l2hdr->b_asize = csize;
5276                 l2hdr->b_tmp_cdata = cdata;
5277                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5278                 return (B_TRUE);
5279         } else {
5280                 /*
5281                  * Compression failed, release the compressed buffer.
5282                  * l2hdr will be left unmodified.
5283                  */
5284                 zio_data_buf_free(cdata, len);
5285                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5286                 return (B_FALSE);
5287         }
5288 }
5289
5290 /*
5291  * Decompresses a zio read back from an l2arc device. On success, the
5292  * underlying zio's io_data buffer is overwritten by the uncompressed
5293  * version. On decompression error (corrupt compressed stream), the
5294  * zio->io_error value is set to signal an I/O error.
5295  *
5296  * Please note that the compressed data stream is not checksummed, so
5297  * if the underlying device is experiencing data corruption, we may feed
5298  * corrupt data to the decompressor, so the decompressor needs to be
5299  * able to handle this situation (LZ4 does).
5300  */
5301 static void
5302 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5303 {
5304         uint64_t csize;
5305         void *cdata;
5306
5307         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5308
5309         if (zio->io_error != 0) {
5310                 /*
5311                  * An io error has occured, just restore the original io
5312                  * size in preparation for a main pool read.
5313                  */
5314                 zio->io_orig_size = zio->io_size = hdr->b_size;
5315                 return;
5316         }
5317
5318         if (c == ZIO_COMPRESS_EMPTY) {
5319                 /*
5320                  * An empty buffer results in a null zio, which means we
5321                  * need to fill its io_data after we're done restoring the
5322                  * buffer's contents.
5323                  */
5324                 ASSERT(hdr->b_buf != NULL);
5325                 bzero(hdr->b_buf->b_data, hdr->b_size);
5326                 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5327         } else {
5328                 ASSERT(zio->io_data != NULL);
5329                 /*
5330                  * We copy the compressed data from the start of the arc buffer
5331                  * (the zio_read will have pulled in only what we need, the
5332                  * rest is garbage which we will overwrite at decompression)
5333                  * and then decompress back to the ARC data buffer. This way we
5334                  * can minimize copying by simply decompressing back over the
5335                  * original compressed data (rather than decompressing to an
5336                  * aux buffer and then copying back the uncompressed buffer,
5337                  * which is likely to be much larger).
5338                  */
5339                 csize = zio->io_size;
5340                 cdata = zio_data_buf_alloc(csize);
5341                 bcopy(zio->io_data, cdata, csize);
5342                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5343                     hdr->b_size) != 0)
5344                         zio->io_error = SET_ERROR(EIO);
5345                 zio_data_buf_free(cdata, csize);
5346         }
5347
5348         /* Restore the expected uncompressed IO size. */
5349         zio->io_orig_size = zio->io_size = hdr->b_size;
5350 }
5351
5352 /*
5353  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5354  * This buffer serves as a temporary holder of compressed data while
5355  * the buffer entry is being written to an l2arc device. Once that is
5356  * done, we can dispose of it.
5357  */
5358 static void
5359 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5360 {
5361         l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5362
5363         ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
5364         if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
5365                 /*
5366                  * If the data was compressed, then we've allocated a
5367                  * temporary buffer for it, so now we need to release it.
5368                  */
5369                 ASSERT(l2hdr->b_tmp_cdata != NULL);
5370                 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5371                 l2hdr->b_tmp_cdata = NULL;
5372         } else {
5373                 ASSERT(l2hdr->b_tmp_cdata == NULL);
5374         }
5375 }
5376
5377 /*
5378  * This thread feeds the L2ARC at regular intervals.  This is the beating
5379  * heart of the L2ARC.
5380  */
5381 static void
5382 l2arc_feed_thread(void)
5383 {
5384         callb_cpr_t cpr;
5385         l2arc_dev_t *dev;
5386         spa_t *spa;
5387         uint64_t size, wrote;
5388         clock_t begin, next = ddi_get_lbolt();
5389         boolean_t headroom_boost = B_FALSE;
5390         fstrans_cookie_t cookie;
5391
5392         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5393
5394         mutex_enter(&l2arc_feed_thr_lock);
5395
5396         cookie = spl_fstrans_mark();
5397         while (l2arc_thread_exit == 0) {
5398                 CALLB_CPR_SAFE_BEGIN(&cpr);
5399                 (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
5400                     &l2arc_feed_thr_lock, next);
5401                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5402                 next = ddi_get_lbolt() + hz;
5403
5404                 /*
5405                  * Quick check for L2ARC devices.
5406                  */
5407                 mutex_enter(&l2arc_dev_mtx);
5408                 if (l2arc_ndev == 0) {
5409                         mutex_exit(&l2arc_dev_mtx);
5410                         continue;
5411                 }
5412                 mutex_exit(&l2arc_dev_mtx);
5413                 begin = ddi_get_lbolt();
5414
5415                 /*
5416                  * This selects the next l2arc device to write to, and in
5417                  * doing so the next spa to feed from: dev->l2ad_spa.   This
5418                  * will return NULL if there are now no l2arc devices or if
5419                  * they are all faulted.
5420                  *
5421                  * If a device is returned, its spa's config lock is also
5422                  * held to prevent device removal.  l2arc_dev_get_next()
5423                  * will grab and release l2arc_dev_mtx.
5424                  */
5425                 if ((dev = l2arc_dev_get_next()) == NULL)
5426                         continue;
5427
5428                 spa = dev->l2ad_spa;
5429                 ASSERT(spa != NULL);
5430
5431                 /*
5432                  * If the pool is read-only then force the feed thread to
5433                  * sleep a little longer.
5434                  */
5435                 if (!spa_writeable(spa)) {
5436                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5437                         spa_config_exit(spa, SCL_L2ARC, dev);
5438                         continue;
5439                 }
5440
5441                 /*
5442                  * Avoid contributing to memory pressure.
5443                  */
5444                 if (arc_no_grow) {
5445                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5446                         spa_config_exit(spa, SCL_L2ARC, dev);
5447                         continue;
5448                 }
5449
5450                 ARCSTAT_BUMP(arcstat_l2_feeds);
5451
5452                 size = l2arc_write_size();
5453
5454                 /*
5455                  * Evict L2ARC buffers that will be overwritten.
5456                  */
5457                 l2arc_evict(dev, size, B_FALSE);
5458
5459                 /*
5460                  * Write ARC buffers.
5461                  */
5462                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5463
5464                 /*
5465                  * Calculate interval between writes.
5466                  */
5467                 next = l2arc_write_interval(begin, size, wrote);
5468                 spa_config_exit(spa, SCL_L2ARC, dev);
5469         }
5470         spl_fstrans_unmark(cookie);
5471
5472         l2arc_thread_exit = 0;
5473         cv_broadcast(&l2arc_feed_thr_cv);
5474         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
5475         thread_exit();
5476 }
5477
5478 boolean_t
5479 l2arc_vdev_present(vdev_t *vd)
5480 {
5481         l2arc_dev_t *dev;
5482
5483         mutex_enter(&l2arc_dev_mtx);
5484         for (dev = list_head(l2arc_dev_list); dev != NULL;
5485             dev = list_next(l2arc_dev_list, dev)) {
5486                 if (dev->l2ad_vdev == vd)
5487                         break;
5488         }
5489         mutex_exit(&l2arc_dev_mtx);
5490
5491         return (dev != NULL);
5492 }
5493
5494 /*
5495  * Add a vdev for use by the L2ARC.  By this point the spa has already
5496  * validated the vdev and opened it.
5497  */
5498 void
5499 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5500 {
5501         l2arc_dev_t *adddev;
5502
5503         ASSERT(!l2arc_vdev_present(vd));
5504
5505         /*
5506          * Create a new l2arc device entry.
5507          */
5508         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5509         adddev->l2ad_spa = spa;
5510         adddev->l2ad_vdev = vd;
5511         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5512         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5513         adddev->l2ad_hand = adddev->l2ad_start;
5514         adddev->l2ad_evict = adddev->l2ad_start;
5515         adddev->l2ad_first = B_TRUE;
5516         adddev->l2ad_writing = B_FALSE;
5517         list_link_init(&adddev->l2ad_node);
5518
5519         /*
5520          * This is a list of all ARC buffers that are still valid on the
5521          * device.
5522          */
5523         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5524         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5525             offsetof(arc_buf_hdr_t, b_l2node));
5526
5527         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5528
5529         /*
5530          * Add device to global list
5531          */
5532         mutex_enter(&l2arc_dev_mtx);
5533         list_insert_head(l2arc_dev_list, adddev);
5534         atomic_inc_64(&l2arc_ndev);
5535         mutex_exit(&l2arc_dev_mtx);
5536 }
5537
5538 /*
5539  * Remove a vdev from the L2ARC.
5540  */
5541 void
5542 l2arc_remove_vdev(vdev_t *vd)
5543 {
5544         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5545
5546         /*
5547          * Find the device by vdev
5548          */
5549         mutex_enter(&l2arc_dev_mtx);
5550         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5551                 nextdev = list_next(l2arc_dev_list, dev);
5552                 if (vd == dev->l2ad_vdev) {
5553                         remdev = dev;
5554                         break;
5555                 }
5556         }
5557         ASSERT(remdev != NULL);
5558
5559         /*
5560          * Remove device from global list
5561          */
5562         list_remove(l2arc_dev_list, remdev);
5563         l2arc_dev_last = NULL;          /* may have been invalidated */
5564         atomic_dec_64(&l2arc_ndev);
5565         mutex_exit(&l2arc_dev_mtx);
5566
5567         /*
5568          * Clear all buflists and ARC references.  L2ARC device flush.
5569          */
5570         l2arc_evict(remdev, 0, B_TRUE);
5571         list_destroy(remdev->l2ad_buflist);
5572         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5573         kmem_free(remdev, sizeof (l2arc_dev_t));
5574 }
5575
5576 void
5577 l2arc_init(void)
5578 {
5579         l2arc_thread_exit = 0;
5580         l2arc_ndev = 0;
5581         l2arc_writes_sent = 0;
5582         l2arc_writes_done = 0;
5583
5584         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5585         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5586         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5587         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5588         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5589
5590         l2arc_dev_list = &L2ARC_dev_list;
5591         l2arc_free_on_write = &L2ARC_free_on_write;
5592         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5593             offsetof(l2arc_dev_t, l2ad_node));
5594         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5595             offsetof(l2arc_data_free_t, l2df_list_node));
5596 }
5597
5598 void
5599 l2arc_fini(void)
5600 {
5601         /*
5602          * This is called from dmu_fini(), which is called from spa_fini();
5603          * Because of this, we can assume that all l2arc devices have
5604          * already been removed when the pools themselves were removed.
5605          */
5606
5607         l2arc_do_free_on_write();
5608
5609         mutex_destroy(&l2arc_feed_thr_lock);
5610         cv_destroy(&l2arc_feed_thr_cv);
5611         mutex_destroy(&l2arc_dev_mtx);
5612         mutex_destroy(&l2arc_buflist_mtx);
5613         mutex_destroy(&l2arc_free_on_write_mtx);
5614
5615         list_destroy(l2arc_dev_list);
5616         list_destroy(l2arc_free_on_write);
5617 }
5618
5619 void
5620 l2arc_start(void)
5621 {
5622         if (!(spa_mode_global & FWRITE))
5623                 return;
5624
5625         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5626             TS_RUN, minclsyspri);
5627 }
5628
5629 void
5630 l2arc_stop(void)
5631 {
5632         if (!(spa_mode_global & FWRITE))
5633                 return;
5634
5635         mutex_enter(&l2arc_feed_thr_lock);
5636         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
5637         l2arc_thread_exit = 1;
5638         while (l2arc_thread_exit != 0)
5639                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5640         mutex_exit(&l2arc_feed_thr_lock);
5641 }
5642
5643 #if defined(_KERNEL) && defined(HAVE_SPL)
5644 EXPORT_SYMBOL(arc_buf_size);
5645 EXPORT_SYMBOL(arc_write);
5646 EXPORT_SYMBOL(arc_read);
5647 EXPORT_SYMBOL(arc_buf_remove_ref);
5648 EXPORT_SYMBOL(arc_buf_info);
5649 EXPORT_SYMBOL(arc_getbuf_func);
5650 EXPORT_SYMBOL(arc_add_prune_callback);
5651 EXPORT_SYMBOL(arc_remove_prune_callback);
5652
5653 module_param(zfs_arc_min, ulong, 0644);
5654 MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
5655
5656 module_param(zfs_arc_max, ulong, 0644);
5657 MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
5658
5659 module_param(zfs_arc_meta_limit, ulong, 0644);
5660 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
5661
5662 module_param(zfs_arc_meta_prune, int, 0644);
5663 MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
5664
5665 module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
5666 MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
5667         "Limit number of restarts in arc_adjust_meta");
5668
5669 module_param(zfs_arc_grow_retry, int, 0644);
5670 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
5671
5672 module_param(zfs_arc_p_aggressive_disable, int, 0644);
5673 MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");
5674
5675 module_param(zfs_arc_p_dampener_disable, int, 0644);
5676 MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
5677
5678 module_param(zfs_arc_shrink_shift, int, 0644);
5679 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
5680
5681 module_param(zfs_disable_dup_eviction, int, 0644);
5682 MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
5683
5684 module_param(zfs_arc_average_blocksize, int, 0444);
5685 MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
5686
5687 module_param(zfs_arc_memory_throttle_disable, int, 0644);
5688 MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
5689
5690 module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
5691 MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
5692
5693 module_param(l2arc_write_max, ulong, 0644);
5694 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
5695
5696 module_param(l2arc_write_boost, ulong, 0644);
5697 MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
5698
5699 module_param(l2arc_headroom, ulong, 0644);
5700 MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
5701
5702 module_param(l2arc_headroom_boost, ulong, 0644);
5703 MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
5704
5705 module_param(l2arc_feed_secs, ulong, 0644);
5706 MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
5707
5708 module_param(l2arc_feed_min_ms, ulong, 0644);
5709 MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
5710
5711 module_param(l2arc_noprefetch, int, 0644);
5712 MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
5713
5714 module_param(l2arc_nocompress, int, 0644);
5715 MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
5716
5717 module_param(l2arc_feed_again, int, 0644);
5718 MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
5719
5720 module_param(l2arc_norw, int, 0644);
5721 MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
5722
5723 #endif