1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2011 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 */
27
28 /*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory. This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about. Our cache is not so simple. At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them. Blocks are only evictable
43 * when there are no external references active. This makes
44 * eviction far more problematic: we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space. In these circumstances we are unable to adjust the cache
49 * size. To prevent the cache growing unbounded at these times we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss. Our model has a variable sized cache. It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size. So
62 * when adjusting the cache size following a cache miss, it is simply
63 * a matter of choosing a single page to evict. In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes). We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
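/*
 * Illustrative sketch (not part of the original file): one way to picture
 * difference (3) above.  Because cache blocks vary in size, making room
 * for a miss means walking the eviction list and accumulating victims
 * until their combined size covers the request, rather than dropping a
 * single fixed-size page.  The helper below is a simplified stand-in for
 * the real logic in arc_evict() further down; it is compiled out.
 */
#if 0
static uint64_t
example_evict_approx(const uint64_t *sizes, int nblocks, uint64_t wanted)
{
	uint64_t freed = 0;
	int i;

	/* Walk from the "lowest" end of the list until enough is freed. */
	for (i = 0; i < nblocks && freed < wanted; i++)
		freed += sizes[i];

	return (freed);
}
#endif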
72
73 /*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists. The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2. We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes, rather they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table. It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state. When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use mutex_tryenter() to avoid deadlock. Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()). Note however that the data associated
104 * with the buffer may be evicted prior to the callback. The callback
105 * must be made with *no locks held* (to prevent deadlock). Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_buf_evict()
108 * and arc_do_user_evicts().
109 *
110 * It is also possible to register a callback which is run when the
111 * arc_meta_limit is reached and no buffers can be safely evicted. In
112 * this case the arc user should drop a reference on some arc buffers so
113 * they can be reclaimed and the arc_meta_limit honored. For example,
114 * when using the ZPL each dentry holds a reference on a znode. These
115 * dentries must be pruned before the arc buffer holding the znode can
116 * be safely evicted.
117 *
118 * Note that the majority of the performance stats are manipulated
119 * with atomic operations.
120 *
121 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
122 *
123 * - L2ARC buflist creation
124 * - L2ARC buflist eviction
125 * - L2ARC write completion, which walks L2ARC buflists
126 * - ARC header destruction, as it removes from L2ARC buflists
127 * - ARC header release, as it removes from L2ARC buflists
128 */
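/*
 * Illustrative sketch (not part of the original file): the lock ordering
 * rule described above.  While an arc state (list) mutex is held, a hash
 * table lock may only be taken with mutex_tryenter(); blocking with
 * mutex_enter() could deadlock against a thread that holds the hash lock
 * and is waiting for the list lock.  The helper and its names are
 * hypothetical and compiled out.
 */
#if 0
static boolean_t
example_try_hash_while_list_locked(kmutex_t *list_lock, kmutex_t *hash_lock)
{
	ASSERT(MUTEX_HELD(list_lock));

	if (!mutex_tryenter(hash_lock))
		return (B_FALSE);	/* caller must skip this buffer */

	/* ... operate on the header ... */
	mutex_exit(hash_lock);
	return (B_TRUE);
}
#endif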
129
130 #include <sys/spa.h>
131 #include <sys/zio.h>
132 #include <sys/zio_compress.h>
133 #include <sys/zfs_context.h>
134 #include <sys/arc.h>
135 #include <sys/vdev.h>
136 #include <sys/vdev_impl.h>
137 #ifdef _KERNEL
138 #include <sys/vmsystm.h>
139 #include <vm/anon.h>
140 #include <sys/fs/swapnode.h>
141 #include <sys/zpl.h>
142 #endif
143 #include <sys/callb.h>
144 #include <sys/kstat.h>
145 #include <sys/dmu_tx.h>
146 #include <zfs_fletcher.h>
147
148 static kmutex_t arc_reclaim_thr_lock;
149 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
150 static uint8_t arc_thread_exit;
151
152 /* number of bytes to prune from caches when arc_meta_limit is reached */
153 int zfs_arc_meta_prune = 1048576;
154
155 typedef enum arc_reclaim_strategy {
156 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
157 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
158 } arc_reclaim_strategy_t;
159
160 /* number of seconds before growing cache again */
161 int zfs_arc_grow_retry = 5;
162
163 /* shift of arc_c for calculating both min and max arc_p */
164 int zfs_arc_p_min_shift = 4;
165
166 /* log2(fraction of arc to reclaim) */
167 int zfs_arc_shrink_shift = 5;
168
169 /*
170 * minimum lifespan of a prefetch block in clock ticks
171 * (initialized in arc_init())
172 */
173 int zfs_arc_min_prefetch_lifespan = HZ;
174
175 /* disable proactive arc throttle due to low memory */
176 int zfs_arc_memory_throttle_disable = 1;
177
178 /* disable duplicate buffer eviction */
179 int zfs_disable_dup_eviction = 0;
180
181 static int arc_dead;
182
183 /* expiration time for arc_no_grow */
184 static clock_t arc_grow_time = 0;
185
186 /*
187 * The arc has filled available memory and has now warmed up.
188 */
189 static boolean_t arc_warm;
190
191 /*
192 * These tunables are for performance analysis.
193 */
194 unsigned long zfs_arc_max = 0;
195 unsigned long zfs_arc_min = 0;
196 unsigned long zfs_arc_meta_limit = 0;
197
198 /*
199 * Note that buffers can be in one of 6 states:
200 * ARC_anon - anonymous (discussed below)
201 * ARC_mru - recently used, currently cached
202 * ARC_mru_ghost - recently used, no longer in cache
203 * ARC_mfu - frequently used, currently cached
204 * ARC_mfu_ghost - frequently used, no longer in cache
205 * ARC_l2c_only - exists in L2ARC but not other states
206 * When there are no active references to the buffer, they are
207 * linked onto a list in one of these arc states. These are
208 * the only buffers that can be evicted or deleted. Within each
209 * state there are multiple lists, one for meta-data and one for
210 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
211 * etc.) is tracked separately so that it can be managed more
212 * explicitly: favored over data, limited explicitly.
213 *
214 * Anonymous buffers are buffers that are not associated with
215 * a DVA. These are buffers that hold dirty block copies
216 * before they are written to stable storage. By definition,
217 * they are "ref'd" and are considered part of arc_mru
218 * that cannot be freed. Generally, they will acquire a DVA
219 * as they are written and migrate onto the arc_mru list.
220 *
221 * The ARC_l2c_only state is for buffers that are in the second
222 * level ARC but no longer in any of the ARC_m* lists. The second
223 * level ARC itself may also contain buffers that are in any of
224 * the ARC_m* states - meaning that a buffer can exist in two
225 * places. The reason for the ARC_l2c_only state is to keep the
226 * buffer header in the hash table, so that reads that hit the
227 * second level ARC benefit from these fast lookups.
228 */
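/*
 * Illustrative walk-through (not part of the original file): a freshly
 * dirtied buffer starts out in ARC_anon; once it is written and acquires
 * a DVA it sits in ARC_mru.  If its data is later evicted, only the
 * header survives in ARC_mru_ghost, and a subsequent read that hits that
 * ghost entry brings the data back in and promotes the block to ARC_mfu.
 * A block that remains only on an L2ARC device, and in none of the
 * ARC_m* lists, is tracked as ARC_l2c_only.
 */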
229
230 typedef struct arc_state {
231 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
232 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
233 uint64_t arcs_size; /* total amount of data in this state */
234 kmutex_t arcs_mtx;
235 } arc_state_t;
236
237 /* The 6 states: */
238 static arc_state_t ARC_anon;
239 static arc_state_t ARC_mru;
240 static arc_state_t ARC_mru_ghost;
241 static arc_state_t ARC_mfu;
242 static arc_state_t ARC_mfu_ghost;
243 static arc_state_t ARC_l2c_only;
244
245 typedef struct arc_stats {
246 kstat_named_t arcstat_hits;
247 kstat_named_t arcstat_misses;
248 kstat_named_t arcstat_demand_data_hits;
249 kstat_named_t arcstat_demand_data_misses;
250 kstat_named_t arcstat_demand_metadata_hits;
251 kstat_named_t arcstat_demand_metadata_misses;
252 kstat_named_t arcstat_prefetch_data_hits;
253 kstat_named_t arcstat_prefetch_data_misses;
254 kstat_named_t arcstat_prefetch_metadata_hits;
255 kstat_named_t arcstat_prefetch_metadata_misses;
256 kstat_named_t arcstat_mru_hits;
257 kstat_named_t arcstat_mru_ghost_hits;
258 kstat_named_t arcstat_mfu_hits;
259 kstat_named_t arcstat_mfu_ghost_hits;
260 kstat_named_t arcstat_deleted;
261 kstat_named_t arcstat_recycle_miss;
262 kstat_named_t arcstat_mutex_miss;
263 kstat_named_t arcstat_evict_skip;
264 kstat_named_t arcstat_evict_l2_cached;
265 kstat_named_t arcstat_evict_l2_eligible;
266 kstat_named_t arcstat_evict_l2_ineligible;
267 kstat_named_t arcstat_hash_elements;
268 kstat_named_t arcstat_hash_elements_max;
269 kstat_named_t arcstat_hash_collisions;
270 kstat_named_t arcstat_hash_chains;
271 kstat_named_t arcstat_hash_chain_max;
272 kstat_named_t arcstat_p;
273 kstat_named_t arcstat_c;
274 kstat_named_t arcstat_c_min;
275 kstat_named_t arcstat_c_max;
276 kstat_named_t arcstat_size;
277 kstat_named_t arcstat_hdr_size;
278 kstat_named_t arcstat_data_size;
279 kstat_named_t arcstat_other_size;
280 kstat_named_t arcstat_anon_size;
281 kstat_named_t arcstat_anon_evict_data;
282 kstat_named_t arcstat_anon_evict_metadata;
283 kstat_named_t arcstat_mru_size;
284 kstat_named_t arcstat_mru_evict_data;
285 kstat_named_t arcstat_mru_evict_metadata;
286 kstat_named_t arcstat_mru_ghost_size;
287 kstat_named_t arcstat_mru_ghost_evict_data;
288 kstat_named_t arcstat_mru_ghost_evict_metadata;
289 kstat_named_t arcstat_mfu_size;
290 kstat_named_t arcstat_mfu_evict_data;
291 kstat_named_t arcstat_mfu_evict_metadata;
292 kstat_named_t arcstat_mfu_ghost_size;
293 kstat_named_t arcstat_mfu_ghost_evict_data;
294 kstat_named_t arcstat_mfu_ghost_evict_metadata;
295 kstat_named_t arcstat_l2_hits;
296 kstat_named_t arcstat_l2_misses;
297 kstat_named_t arcstat_l2_feeds;
298 kstat_named_t arcstat_l2_rw_clash;
299 kstat_named_t arcstat_l2_read_bytes;
300 kstat_named_t arcstat_l2_write_bytes;
301 kstat_named_t arcstat_l2_writes_sent;
302 kstat_named_t arcstat_l2_writes_done;
303 kstat_named_t arcstat_l2_writes_error;
304 kstat_named_t arcstat_l2_writes_hdr_miss;
305 kstat_named_t arcstat_l2_evict_lock_retry;
306 kstat_named_t arcstat_l2_evict_reading;
307 kstat_named_t arcstat_l2_free_on_write;
308 kstat_named_t arcstat_l2_abort_lowmem;
309 kstat_named_t arcstat_l2_cksum_bad;
310 kstat_named_t arcstat_l2_io_error;
311 kstat_named_t arcstat_l2_size;
312 kstat_named_t arcstat_l2_asize;
313 kstat_named_t arcstat_l2_hdr_size;
314 kstat_named_t arcstat_l2_compress_successes;
315 kstat_named_t arcstat_l2_compress_zeros;
316 kstat_named_t arcstat_l2_compress_failures;
317 kstat_named_t arcstat_memory_throttle_count;
318 kstat_named_t arcstat_duplicate_buffers;
319 kstat_named_t arcstat_duplicate_buffers_size;
320 kstat_named_t arcstat_duplicate_reads;
321 kstat_named_t arcstat_memory_direct_count;
322 kstat_named_t arcstat_memory_indirect_count;
323 kstat_named_t arcstat_no_grow;
324 kstat_named_t arcstat_tempreserve;
325 kstat_named_t arcstat_loaned_bytes;
326 kstat_named_t arcstat_prune;
327 kstat_named_t arcstat_meta_used;
328 kstat_named_t arcstat_meta_limit;
329 kstat_named_t arcstat_meta_max;
330 } arc_stats_t;
331
332 static arc_stats_t arc_stats = {
333 { "hits", KSTAT_DATA_UINT64 },
334 { "misses", KSTAT_DATA_UINT64 },
335 { "demand_data_hits", KSTAT_DATA_UINT64 },
336 { "demand_data_misses", KSTAT_DATA_UINT64 },
337 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
338 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
339 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
340 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
341 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
342 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
343 { "mru_hits", KSTAT_DATA_UINT64 },
344 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
345 { "mfu_hits", KSTAT_DATA_UINT64 },
346 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
347 { "deleted", KSTAT_DATA_UINT64 },
348 { "recycle_miss", KSTAT_DATA_UINT64 },
349 { "mutex_miss", KSTAT_DATA_UINT64 },
350 { "evict_skip", KSTAT_DATA_UINT64 },
351 { "evict_l2_cached", KSTAT_DATA_UINT64 },
352 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
353 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
354 { "hash_elements", KSTAT_DATA_UINT64 },
355 { "hash_elements_max", KSTAT_DATA_UINT64 },
356 { "hash_collisions", KSTAT_DATA_UINT64 },
357 { "hash_chains", KSTAT_DATA_UINT64 },
358 { "hash_chain_max", KSTAT_DATA_UINT64 },
359 { "p", KSTAT_DATA_UINT64 },
360 { "c", KSTAT_DATA_UINT64 },
361 { "c_min", KSTAT_DATA_UINT64 },
362 { "c_max", KSTAT_DATA_UINT64 },
363 { "size", KSTAT_DATA_UINT64 },
364 { "hdr_size", KSTAT_DATA_UINT64 },
365 { "data_size", KSTAT_DATA_UINT64 },
366 { "other_size", KSTAT_DATA_UINT64 },
367 { "anon_size", KSTAT_DATA_UINT64 },
368 { "anon_evict_data", KSTAT_DATA_UINT64 },
369 { "anon_evict_metadata", KSTAT_DATA_UINT64 },
370 { "mru_size", KSTAT_DATA_UINT64 },
371 { "mru_evict_data", KSTAT_DATA_UINT64 },
372 { "mru_evict_metadata", KSTAT_DATA_UINT64 },
373 { "mru_ghost_size", KSTAT_DATA_UINT64 },
374 { "mru_ghost_evict_data", KSTAT_DATA_UINT64 },
375 { "mru_ghost_evict_metadata", KSTAT_DATA_UINT64 },
376 { "mfu_size", KSTAT_DATA_UINT64 },
377 { "mfu_evict_data", KSTAT_DATA_UINT64 },
378 { "mfu_evict_metadata", KSTAT_DATA_UINT64 },
379 { "mfu_ghost_size", KSTAT_DATA_UINT64 },
380 { "mfu_ghost_evict_data", KSTAT_DATA_UINT64 },
381 { "mfu_ghost_evict_metadata", KSTAT_DATA_UINT64 },
382 { "l2_hits", KSTAT_DATA_UINT64 },
383 { "l2_misses", KSTAT_DATA_UINT64 },
384 { "l2_feeds", KSTAT_DATA_UINT64 },
385 { "l2_rw_clash", KSTAT_DATA_UINT64 },
386 { "l2_read_bytes", KSTAT_DATA_UINT64 },
387 { "l2_write_bytes", KSTAT_DATA_UINT64 },
388 { "l2_writes_sent", KSTAT_DATA_UINT64 },
389 { "l2_writes_done", KSTAT_DATA_UINT64 },
390 { "l2_writes_error", KSTAT_DATA_UINT64 },
391 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
392 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
393 { "l2_evict_reading", KSTAT_DATA_UINT64 },
394 { "l2_free_on_write", KSTAT_DATA_UINT64 },
395 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
396 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
397 { "l2_io_error", KSTAT_DATA_UINT64 },
398 { "l2_size", KSTAT_DATA_UINT64 },
399 { "l2_asize", KSTAT_DATA_UINT64 },
400 { "l2_hdr_size", KSTAT_DATA_UINT64 },
401 { "l2_compress_successes", KSTAT_DATA_UINT64 },
402 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
403 { "l2_compress_failures", KSTAT_DATA_UINT64 },
404 { "memory_throttle_count", KSTAT_DATA_UINT64 },
405 { "duplicate_buffers", KSTAT_DATA_UINT64 },
406 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
407 { "duplicate_reads", KSTAT_DATA_UINT64 },
408 { "memory_direct_count", KSTAT_DATA_UINT64 },
409 { "memory_indirect_count", KSTAT_DATA_UINT64 },
410 { "arc_no_grow", KSTAT_DATA_UINT64 },
411 { "arc_tempreserve", KSTAT_DATA_UINT64 },
412 { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
413 { "arc_prune", KSTAT_DATA_UINT64 },
414 { "arc_meta_used", KSTAT_DATA_UINT64 },
415 { "arc_meta_limit", KSTAT_DATA_UINT64 },
416 { "arc_meta_max", KSTAT_DATA_UINT64 },
417 };
418
419 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
420
421 #define ARCSTAT_INCR(stat, val) \
422 atomic_add_64(&arc_stats.stat.value.ui64, (val));
423
424 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
425 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
426
427 #define ARCSTAT_MAX(stat, val) { \
428 uint64_t m; \
429 while ((val) > (m = arc_stats.stat.value.ui64) && \
430 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
431 continue; \
432 }
433
434 #define ARCSTAT_MAXSTAT(stat) \
435 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
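/*
 * Illustrative sketch (not part of the original file): ARCSTAT_MAX above
 * is a lock-free running maximum.  It re-reads the current value and
 * retries the compare-and-swap until either the stored maximum already
 * covers val or the CAS succeeds.  Written out as a function (compiled
 * out here) it is roughly:
 */
#if 0
static void
example_stat_max(uint64_t *statp, uint64_t val)
{
	uint64_t m;

	while (val > (m = *statp) &&
	    m != atomic_cas_64(statp, m, val))
		continue;
}
#endif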
436
437 /*
438 * We define a macro to allow ARC hits/misses to be easily broken down by
439 * two separate conditions, giving a total of four different subtypes for
440 * each of hits and misses (so eight statistics total).
441 */
442 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
443 if (cond1) { \
444 if (cond2) { \
445 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
446 } else { \
447 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
448 } \
449 } else { \
450 if (cond2) { \
451 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
452 } else { \
453 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
454 } \
455 }
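/*
 * Example (taken from arc_buf_add_ref() below): a hit is broken down by
 * demand vs. prefetch and data vs. metadata with a single invocation:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */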
456
457 kstat_t *arc_ksp;
458 static arc_state_t *arc_anon;
459 static arc_state_t *arc_mru;
460 static arc_state_t *arc_mru_ghost;
461 static arc_state_t *arc_mfu;
462 static arc_state_t *arc_mfu_ghost;
463 static arc_state_t *arc_l2c_only;
464
465 /*
466 * There are several ARC variables that are critical to export as kstats --
467 * but we don't want to have to grovel around in the kstat whenever we wish to
468 * manipulate them. For these variables, we therefore define them to be in
469 * terms of the statistic variable. This assures that we are not introducing
470 * the possibility of inconsistency by having shadow copies of the variables,
471 * while still allowing the code to be readable.
472 */
473 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
474 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
475 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
476 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
477 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
478 #define arc_no_grow ARCSTAT(arcstat_no_grow)
479 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
480 #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
481 #define arc_meta_used ARCSTAT(arcstat_meta_used)
482 #define arc_meta_limit ARCSTAT(arcstat_meta_limit)
483 #define arc_meta_max ARCSTAT(arcstat_meta_max)
484
485 #define L2ARC_IS_VALID_COMPRESS(_c_) \
486 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
487
488 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
489
490 typedef struct arc_callback arc_callback_t;
491
492 struct arc_callback {
493 void *acb_private;
494 arc_done_func_t *acb_done;
495 arc_buf_t *acb_buf;
496 zio_t *acb_zio_dummy;
497 arc_callback_t *acb_next;
498 };
499
500 typedef struct arc_write_callback arc_write_callback_t;
501
502 struct arc_write_callback {
503 void *awcb_private;
504 arc_done_func_t *awcb_ready;
505 arc_done_func_t *awcb_done;
506 arc_buf_t *awcb_buf;
507 };
508
509 struct arc_buf_hdr {
510 /* protected by hash lock */
511 dva_t b_dva;
512 uint64_t b_birth;
513 uint64_t b_cksum0;
514
515 kmutex_t b_freeze_lock;
516 zio_cksum_t *b_freeze_cksum;
517
518 arc_buf_hdr_t *b_hash_next;
519 arc_buf_t *b_buf;
520 uint32_t b_flags;
521 uint32_t b_datacnt;
522
523 arc_callback_t *b_acb;
524 kcondvar_t b_cv;
525
526 /* immutable */
527 arc_buf_contents_t b_type;
528 uint64_t b_size;
529 uint64_t b_spa;
530
531 /* protected by arc state mutex */
532 arc_state_t *b_state;
533 list_node_t b_arc_node;
534
535 /* updated atomically */
536 clock_t b_arc_access;
537
538 /* self protecting */
539 refcount_t b_refcnt;
540
541 l2arc_buf_hdr_t *b_l2hdr;
542 list_node_t b_l2node;
543 };
544
545 static list_t arc_prune_list;
546 static kmutex_t arc_prune_mtx;
547 static arc_buf_t *arc_eviction_list;
548 static kmutex_t arc_eviction_mtx;
549 static arc_buf_hdr_t arc_eviction_hdr;
550 static void arc_get_data_buf(arc_buf_t *buf);
551 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
552 static int arc_evict_needed(arc_buf_contents_t type);
553 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
554
555 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
556
557 #define GHOST_STATE(state) \
558 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
559 (state) == arc_l2c_only)
560
561 /*
562 * Private ARC flags. These flags are private ARC only flags that will show up
563 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
564 * be passed in as arc_flags in things like arc_read. However, these flags
565 * should never be passed and should only be set by ARC code. When adding new
566 * public flags, make sure not to smash the private ones.
567 */
568
569 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
570 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
571 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
572 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
573 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
574 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
575 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
576 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
577 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
578 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
579
580 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
581 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
582 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
583 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
584 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
585 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
586 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
587 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
588 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
589 (hdr)->b_l2hdr != NULL)
590 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
591 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
592 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
593
594 /*
595 * Other sizes
596 */
597
598 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
599 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
600
601 /*
602 * Hash table routines
603 */
604
605 #define HT_LOCK_ALIGN 64
606 #define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
607
608 struct ht_lock {
609 kmutex_t ht_lock;
610 #ifdef _KERNEL
611 unsigned char pad[HT_LOCK_PAD];
612 #endif
613 };
614
615 #define BUF_LOCKS 256
616 typedef struct buf_hash_table {
617 uint64_t ht_mask;
618 arc_buf_hdr_t **ht_table;
619 struct ht_lock ht_locks[BUF_LOCKS];
620 } buf_hash_table_t;
621
622 static buf_hash_table_t buf_hash_table;
623
624 #define BUF_HASH_INDEX(spa, dva, birth) \
625 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
626 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
627 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
628 #define HDR_LOCK(hdr) \
629 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
630
631 uint64_t zfs_crc64_table[256];
632
633 /*
634 * Level 2 ARC
635 */
636
637 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
638 #define L2ARC_HEADROOM 2 /* num of writes */
639 /*
640 * If we discover during ARC scan any buffers to be compressed, we boost
641 * our headroom for the next scanning cycle by this percentage multiple.
642 */
643 #define L2ARC_HEADROOM_BOOST 200
644 #define L2ARC_FEED_SECS 1 /* caching interval secs */
645 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
646
647 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
648 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
649
650 /*
651 * L2ARC Performance Tunables
652 */
653 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
654 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
655 unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
656 unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
657 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
658 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
659 int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
660 int l2arc_nocompress = B_FALSE; /* don't compress bufs */
661 int l2arc_feed_again = B_TRUE; /* turbo warmup */
662 int l2arc_norw = B_FALSE; /* no reads during writes */
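/*
 * Worked example (illustrative, not part of the original file): with the
 * defaults above the feed thread targets up to 8 MB (l2arc_write_max) of
 * L2ARC writes per l2arc_feed_secs interval, with an extra 8 MB
 * (l2arc_write_boost) allowed per pass while the device is still warming
 * up.  Scanning headroom is l2arc_headroom (2) device writes' worth of
 * buffers, boosted by up to L2ARC_HEADROOM_BOOST (200%) for the next
 * cycle when compressible buffers are found.
 */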
663
664 /*
665 * L2ARC Internals
666 */
667 typedef struct l2arc_dev {
668 vdev_t *l2ad_vdev; /* vdev */
669 spa_t *l2ad_spa; /* spa */
670 uint64_t l2ad_hand; /* next write location */
671 uint64_t l2ad_start; /* first addr on device */
672 uint64_t l2ad_end; /* last addr on device */
673 uint64_t l2ad_evict; /* last addr eviction reached */
674 boolean_t l2ad_first; /* first sweep through */
675 boolean_t l2ad_writing; /* currently writing */
676 list_t *l2ad_buflist; /* buffer list */
677 list_node_t l2ad_node; /* device list node */
678 } l2arc_dev_t;
679
680 static list_t L2ARC_dev_list; /* device list */
681 static list_t *l2arc_dev_list; /* device list pointer */
682 static kmutex_t l2arc_dev_mtx; /* device list mutex */
683 static l2arc_dev_t *l2arc_dev_last; /* last device used */
684 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
685 static list_t L2ARC_free_on_write; /* free after write buf list */
686 static list_t *l2arc_free_on_write; /* free after write list ptr */
687 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
688 static uint64_t l2arc_ndev; /* number of devices */
689
690 typedef struct l2arc_read_callback {
691 arc_buf_t *l2rcb_buf; /* read buffer */
692 spa_t *l2rcb_spa; /* spa */
693 blkptr_t l2rcb_bp; /* original blkptr */
694 zbookmark_t l2rcb_zb; /* original bookmark */
695 int l2rcb_flags; /* original flags */
696 enum zio_compress l2rcb_compress; /* applied compress */
697 } l2arc_read_callback_t;
698
699 typedef struct l2arc_write_callback {
700 l2arc_dev_t *l2wcb_dev; /* device info */
701 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
702 } l2arc_write_callback_t;
703
704 struct l2arc_buf_hdr {
705 /* protected by arc_buf_hdr mutex */
706 l2arc_dev_t *b_dev; /* L2ARC device */
707 uint64_t b_daddr; /* disk address, offset byte */
708 /* compression applied to buffer data */
709 enum zio_compress b_compress;
710 /* real alloc'd buffer size depending on b_compress applied */
711 int b_asize;
712 /* temporary buffer holder for in-flight compressed data */
713 void *b_tmp_cdata;
714 };
715
716 typedef struct l2arc_data_free {
717 /* protected by l2arc_free_on_write_mtx */
718 void *l2df_data;
719 size_t l2df_size;
720 void (*l2df_func)(void *, size_t);
721 list_node_t l2df_list_node;
722 } l2arc_data_free_t;
723
724 static kmutex_t l2arc_feed_thr_lock;
725 static kcondvar_t l2arc_feed_thr_cv;
726 static uint8_t l2arc_thread_exit;
727
728 static void l2arc_read_done(zio_t *zio);
729 static void l2arc_hdr_stat_add(void);
730 static void l2arc_hdr_stat_remove(void);
731
732 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
733 static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
734 enum zio_compress c);
735 static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
736
737 static uint64_t
738 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
739 {
740 uint8_t *vdva = (uint8_t *)dva;
741 uint64_t crc = -1ULL;
742 int i;
743
744 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
745
746 for (i = 0; i < sizeof (dva_t); i++)
747 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
748
749 crc ^= (spa>>8) ^ birth;
750
751 return (crc);
752 }
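/*
 * Illustrative sketch (not part of the original file): how the hash above
 * is consumed.  The 64-bit value is masked down to a bucket index, and
 * the low bits of that same index select one of the BUF_LOCKS striped
 * mutexes, so headers that land in the same bucket always share a lock.
 * The helper below is hypothetical and compiled out; buf_hash_find()
 * performs the same steps inline.
 */
#if 0
static kmutex_t *
example_hash_lock_for(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);

	return (BUF_HASH_LOCK(idx));
}
#endif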
753
754 #define BUF_EMPTY(buf) \
755 ((buf)->b_dva.dva_word[0] == 0 && \
756 (buf)->b_dva.dva_word[1] == 0 && \
757 (buf)->b_birth == 0)
758
759 #define BUF_EQUAL(spa, dva, birth, buf) \
760 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
761 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
762 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
763
764 static void
765 buf_discard_identity(arc_buf_hdr_t *hdr)
766 {
767 hdr->b_dva.dva_word[0] = 0;
768 hdr->b_dva.dva_word[1] = 0;
769 hdr->b_birth = 0;
770 hdr->b_cksum0 = 0;
771 }
772
773 static arc_buf_hdr_t *
774 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
775 {
776 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
777 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
778 arc_buf_hdr_t *buf;
779
780 mutex_enter(hash_lock);
781 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
782 buf = buf->b_hash_next) {
783 if (BUF_EQUAL(spa, dva, birth, buf)) {
784 *lockp = hash_lock;
785 return (buf);
786 }
787 }
788 mutex_exit(hash_lock);
789 *lockp = NULL;
790 return (NULL);
791 }
792
793 /*
794 * Insert an entry into the hash table. If there is already an element
795 * equal to elem in the hash table, then the already existing element
796 * will be returned and the new element will not be inserted.
797 * Otherwise returns NULL.
798 */
799 static arc_buf_hdr_t *
800 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
801 {
802 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
803 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
804 arc_buf_hdr_t *fbuf;
805 uint32_t i;
806
807 ASSERT(!HDR_IN_HASH_TABLE(buf));
808 *lockp = hash_lock;
809 mutex_enter(hash_lock);
810 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
811 fbuf = fbuf->b_hash_next, i++) {
812 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
813 return (fbuf);
814 }
815
816 buf->b_hash_next = buf_hash_table.ht_table[idx];
817 buf_hash_table.ht_table[idx] = buf;
818 buf->b_flags |= ARC_IN_HASH_TABLE;
819
820 /* collect some hash table performance data */
821 if (i > 0) {
822 ARCSTAT_BUMP(arcstat_hash_collisions);
823 if (i == 1)
824 ARCSTAT_BUMP(arcstat_hash_chains);
825
826 ARCSTAT_MAX(arcstat_hash_chain_max, i);
827 }
828
829 ARCSTAT_BUMP(arcstat_hash_elements);
830 ARCSTAT_MAXSTAT(arcstat_hash_elements);
831
832 return (NULL);
833 }
834
835 static void
836 buf_hash_remove(arc_buf_hdr_t *buf)
837 {
838 arc_buf_hdr_t *fbuf, **bufp;
839 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
840
841 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
842 ASSERT(HDR_IN_HASH_TABLE(buf));
843
844 bufp = &buf_hash_table.ht_table[idx];
845 while ((fbuf = *bufp) != buf) {
846 ASSERT(fbuf != NULL);
847 bufp = &fbuf->b_hash_next;
848 }
849 *bufp = buf->b_hash_next;
850 buf->b_hash_next = NULL;
851 buf->b_flags &= ~ARC_IN_HASH_TABLE;
852
853 /* collect some hash table performance data */
854 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
855
856 if (buf_hash_table.ht_table[idx] &&
857 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
858 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
859 }
860
861 /*
862 * Global data structures and functions for the buf kmem cache.
863 */
864 static kmem_cache_t *hdr_cache;
865 static kmem_cache_t *buf_cache;
866
867 static void
868 buf_fini(void)
869 {
870 int i;
871
872 #if defined(_KERNEL) && defined(HAVE_SPL)
873 /* Large allocations which do not require contiguous pages
874 * should be using vmem_free() in the linux kernel */
875 vmem_free(buf_hash_table.ht_table,
876 (buf_hash_table.ht_mask + 1) * sizeof (void *));
877 #else
878 kmem_free(buf_hash_table.ht_table,
879 (buf_hash_table.ht_mask + 1) * sizeof (void *));
880 #endif
881 for (i = 0; i < BUF_LOCKS; i++)
882 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
883 kmem_cache_destroy(hdr_cache);
884 kmem_cache_destroy(buf_cache);
885 }
886
887 /*
888 * Constructor callback - called when the cache is empty
889 * and a new buf is requested.
890 */
891 /* ARGSUSED */
892 static int
893 hdr_cons(void *vbuf, void *unused, int kmflag)
894 {
895 arc_buf_hdr_t *buf = vbuf;
896
897 bzero(buf, sizeof (arc_buf_hdr_t));
898 refcount_create(&buf->b_refcnt);
899 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
900 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
901 list_link_init(&buf->b_arc_node);
902 list_link_init(&buf->b_l2node);
903 arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
904
905 return (0);
906 }
907
908 /* ARGSUSED */
909 static int
910 buf_cons(void *vbuf, void *unused, int kmflag)
911 {
912 arc_buf_t *buf = vbuf;
913
914 bzero(buf, sizeof (arc_buf_t));
915 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
916 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
917
918 return (0);
919 }
920
921 /*
922 * Destructor callback - called when a cached buf is
923 * no longer required.
924 */
925 /* ARGSUSED */
926 static void
927 hdr_dest(void *vbuf, void *unused)
928 {
929 arc_buf_hdr_t *buf = vbuf;
930
931 ASSERT(BUF_EMPTY(buf));
932 refcount_destroy(&buf->b_refcnt);
933 cv_destroy(&buf->b_cv);
934 mutex_destroy(&buf->b_freeze_lock);
935 arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
936 }
937
938 /* ARGSUSED */
939 static void
940 buf_dest(void *vbuf, void *unused)
941 {
942 arc_buf_t *buf = vbuf;
943
944 mutex_destroy(&buf->b_evict_lock);
945 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
946 }
947
948 static void
949 buf_init(void)
950 {
951 uint64_t *ct;
952 uint64_t hsize = 1ULL << 12;
953 int i, j;
954
955 /*
956 * The hash table is big enough to fill all of physical memory
957 * with an average 64K block size. The table will take up
958 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
959 */
960 while (hsize * 65536 < physmem * PAGESIZE)
961 hsize <<= 1;
962 retry:
963 buf_hash_table.ht_mask = hsize - 1;
964 #if defined(_KERNEL) && defined(HAVE_SPL)
965 /* Large allocations which do not require contiguous pages
966 * should be using vmem_alloc() in the linux kernel */
967 buf_hash_table.ht_table =
968 vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
969 #else
970 buf_hash_table.ht_table =
971 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
972 #endif
973 if (buf_hash_table.ht_table == NULL) {
974 ASSERT(hsize > (1ULL << 8));
975 hsize >>= 1;
976 goto retry;
977 }
978
979 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
980 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
981 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
982 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
983
984 for (i = 0; i < 256; i++)
985 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
986 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
987
988 for (i = 0; i < BUF_LOCKS; i++) {
989 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
990 NULL, MUTEX_DEFAULT, NULL);
991 }
992 }
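/*
 * Worked example (illustrative, not part of the original file): on a
 * machine with 8 GiB of physical memory the sizing loop in buf_init()
 * doubles hsize, starting from 2^12, until hsize * 64K >= 8 GiB, giving
 * 131072 buckets (ht_mask = 131071).  With 8-byte pointers the table is
 * then 1 MiB, matching the "128KB/GB" estimate in the comment above.
 */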
993
994 #define ARC_MINTIME (hz>>4) /* 62 ms */
995
996 static void
997 arc_cksum_verify(arc_buf_t *buf)
998 {
999 zio_cksum_t zc;
1000
1001 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1002 return;
1003
1004 mutex_enter(&buf->b_hdr->b_freeze_lock);
1005 if (buf->b_hdr->b_freeze_cksum == NULL ||
1006 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1007 mutex_exit(&buf->b_hdr->b_freeze_lock);
1008 return;
1009 }
1010 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1011 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1012 panic("buffer modified while frozen!");
1013 mutex_exit(&buf->b_hdr->b_freeze_lock);
1014 }
1015
1016 static int
1017 arc_cksum_equal(arc_buf_t *buf)
1018 {
1019 zio_cksum_t zc;
1020 int equal;
1021
1022 mutex_enter(&buf->b_hdr->b_freeze_lock);
1023 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1024 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1025 mutex_exit(&buf->b_hdr->b_freeze_lock);
1026
1027 return (equal);
1028 }
1029
1030 static void
1031 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1032 {
1033 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1034 return;
1035
1036 mutex_enter(&buf->b_hdr->b_freeze_lock);
1037 if (buf->b_hdr->b_freeze_cksum != NULL) {
1038 mutex_exit(&buf->b_hdr->b_freeze_lock);
1039 return;
1040 }
1041 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1042 KM_PUSHPAGE);
1043 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1044 buf->b_hdr->b_freeze_cksum);
1045 mutex_exit(&buf->b_hdr->b_freeze_lock);
1046 }
1047
1048 void
1049 arc_buf_thaw(arc_buf_t *buf)
1050 {
1051 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1052 if (buf->b_hdr->b_state != arc_anon)
1053 panic("modifying non-anon buffer!");
1054 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1055 panic("modifying buffer while i/o in progress!");
1056 arc_cksum_verify(buf);
1057 }
1058
1059 mutex_enter(&buf->b_hdr->b_freeze_lock);
1060 if (buf->b_hdr->b_freeze_cksum != NULL) {
1061 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1062 buf->b_hdr->b_freeze_cksum = NULL;
1063 }
1064
1065 mutex_exit(&buf->b_hdr->b_freeze_lock);
1066 }
1067
1068 void
1069 arc_buf_freeze(arc_buf_t *buf)
1070 {
1071 kmutex_t *hash_lock;
1072
1073 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1074 return;
1075
1076 hash_lock = HDR_LOCK(buf->b_hdr);
1077 mutex_enter(hash_lock);
1078
1079 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1080 buf->b_hdr->b_state == arc_anon);
1081 arc_cksum_compute(buf, B_FALSE);
1082 mutex_exit(hash_lock);
1083 }
1084
1085 static void
1086 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1087 {
1088 ASSERT(MUTEX_HELD(hash_lock));
1089
1090 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1091 (ab->b_state != arc_anon)) {
1092 uint64_t delta = ab->b_size * ab->b_datacnt;
1093 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1094 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1095
1096 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1097 mutex_enter(&ab->b_state->arcs_mtx);
1098 ASSERT(list_link_active(&ab->b_arc_node));
1099 list_remove(list, ab);
1100 if (GHOST_STATE(ab->b_state)) {
1101 ASSERT0(ab->b_datacnt);
1102 ASSERT3P(ab->b_buf, ==, NULL);
1103 delta = ab->b_size;
1104 }
1105 ASSERT(delta > 0);
1106 ASSERT3U(*size, >=, delta);
1107 atomic_add_64(size, -delta);
1108 mutex_exit(&ab->b_state->arcs_mtx);
1109 /* remove the prefetch flag if we get a reference */
1110 if (ab->b_flags & ARC_PREFETCH)
1111 ab->b_flags &= ~ARC_PREFETCH;
1112 }
1113 }
1114
1115 static int
1116 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1117 {
1118 int cnt;
1119 arc_state_t *state = ab->b_state;
1120
1121 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1122 ASSERT(!GHOST_STATE(state));
1123
1124 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1125 (state != arc_anon)) {
1126 uint64_t *size = &state->arcs_lsize[ab->b_type];
1127
1128 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1129 mutex_enter(&state->arcs_mtx);
1130 ASSERT(!list_link_active(&ab->b_arc_node));
1131 list_insert_head(&state->arcs_list[ab->b_type], ab);
1132 ASSERT(ab->b_datacnt > 0);
1133 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1134 mutex_exit(&state->arcs_mtx);
1135 }
1136 return (cnt);
1137 }
1138
1139 /*
1140 * Move the supplied buffer to the indicated state. The mutex
1141 * for the buffer must be held by the caller.
1142 */
1143 static void
1144 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1145 {
1146 arc_state_t *old_state = ab->b_state;
1147 int64_t refcnt = refcount_count(&ab->b_refcnt);
1148 uint64_t from_delta, to_delta;
1149
1150 ASSERT(MUTEX_HELD(hash_lock));
1151 ASSERT(new_state != old_state);
1152 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1153 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1154 ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1155
1156 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1157
1158 /*
1159 * If this buffer is evictable, transfer it from the
1160 * old state list to the new state list.
1161 */
1162 if (refcnt == 0) {
1163 if (old_state != arc_anon) {
1164 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1165 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1166
1167 if (use_mutex)
1168 mutex_enter(&old_state->arcs_mtx);
1169
1170 ASSERT(list_link_active(&ab->b_arc_node));
1171 list_remove(&old_state->arcs_list[ab->b_type], ab);
1172
1173 /*
1174 * If prefetching out of the ghost cache,
1175 * we will have a non-zero datacnt.
1176 */
1177 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1178 /* ghost elements have a ghost size */
1179 ASSERT(ab->b_buf == NULL);
1180 from_delta = ab->b_size;
1181 }
1182 ASSERT3U(*size, >=, from_delta);
1183 atomic_add_64(size, -from_delta);
1184
1185 if (use_mutex)
1186 mutex_exit(&old_state->arcs_mtx);
1187 }
1188 if (new_state != arc_anon) {
1189 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1190 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1191
1192 if (use_mutex)
1193 mutex_enter(&new_state->arcs_mtx);
1194
1195 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1196
1197 /* ghost elements have a ghost size */
1198 if (GHOST_STATE(new_state)) {
1199 ASSERT(ab->b_datacnt == 0);
1200 ASSERT(ab->b_buf == NULL);
1201 to_delta = ab->b_size;
1202 }
1203 atomic_add_64(size, to_delta);
1204
1205 if (use_mutex)
1206 mutex_exit(&new_state->arcs_mtx);
1207 }
1208 }
1209
1210 ASSERT(!BUF_EMPTY(ab));
1211 if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1212 buf_hash_remove(ab);
1213
1214 /* adjust state sizes */
1215 if (to_delta)
1216 atomic_add_64(&new_state->arcs_size, to_delta);
1217 if (from_delta) {
1218 ASSERT3U(old_state->arcs_size, >=, from_delta);
1219 atomic_add_64(&old_state->arcs_size, -from_delta);
1220 }
1221 ab->b_state = new_state;
1222
1223 /* adjust l2arc hdr stats */
1224 if (new_state == arc_l2c_only)
1225 l2arc_hdr_stat_add();
1226 else if (old_state == arc_l2c_only)
1227 l2arc_hdr_stat_remove();
1228 }
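/*
 * Worked example (illustrative, not part of the original file): moving a
 * 128K header with b_datacnt == 2 and no external references from arc_mru
 * to arc_mfu transfers from_delta == to_delta == 256K of evictable lsize
 * and arcs_size between the two states.  Moving the same header into
 * arc_mru_ghost after its data has been dropped (b_datacnt == 0) charges
 * only the 128K "ghost size" of the block to the ghost state, since the
 * data itself was already accounted out when the buffers were destroyed.
 */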
1229
1230 void
1231 arc_space_consume(uint64_t space, arc_space_type_t type)
1232 {
1233 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1234
1235 switch (type) {
1236 default:
1237 break;
1238 case ARC_SPACE_DATA:
1239 ARCSTAT_INCR(arcstat_data_size, space);
1240 break;
1241 case ARC_SPACE_OTHER:
1242 ARCSTAT_INCR(arcstat_other_size, space);
1243 break;
1244 case ARC_SPACE_HDRS:
1245 ARCSTAT_INCR(arcstat_hdr_size, space);
1246 break;
1247 case ARC_SPACE_L2HDRS:
1248 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1249 break;
1250 }
1251
1252 atomic_add_64(&arc_meta_used, space);
1253 atomic_add_64(&arc_size, space);
1254 }
1255
1256 void
1257 arc_space_return(uint64_t space, arc_space_type_t type)
1258 {
1259 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1260
1261 switch (type) {
1262 default:
1263 break;
1264 case ARC_SPACE_DATA:
1265 ARCSTAT_INCR(arcstat_data_size, -space);
1266 break;
1267 case ARC_SPACE_OTHER:
1268 ARCSTAT_INCR(arcstat_other_size, -space);
1269 break;
1270 case ARC_SPACE_HDRS:
1271 ARCSTAT_INCR(arcstat_hdr_size, -space);
1272 break;
1273 case ARC_SPACE_L2HDRS:
1274 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1275 break;
1276 }
1277
1278 ASSERT(arc_meta_used >= space);
1279 if (arc_meta_max < arc_meta_used)
1280 arc_meta_max = arc_meta_used;
1281 atomic_add_64(&arc_meta_used, -space);
1282 ASSERT(arc_size >= space);
1283 atomic_add_64(&arc_size, -space);
1284 }
1285
1286 arc_buf_t *
1287 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1288 {
1289 arc_buf_hdr_t *hdr;
1290 arc_buf_t *buf;
1291
1292 ASSERT3U(size, >, 0);
1293 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1294 ASSERT(BUF_EMPTY(hdr));
1295 hdr->b_size = size;
1296 hdr->b_type = type;
1297 hdr->b_spa = spa_load_guid(spa);
1298 hdr->b_state = arc_anon;
1299 hdr->b_arc_access = 0;
1300 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1301 buf->b_hdr = hdr;
1302 buf->b_data = NULL;
1303 buf->b_efunc = NULL;
1304 buf->b_private = NULL;
1305 buf->b_next = NULL;
1306 hdr->b_buf = buf;
1307 arc_get_data_buf(buf);
1308 hdr->b_datacnt = 1;
1309 hdr->b_flags = 0;
1310 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1311 (void) refcount_add(&hdr->b_refcnt, tag);
1312
1313 return (buf);
1314 }
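/*
 * Illustrative sketch (not part of the original file): the minimal life
 * cycle of an anonymous buffer from a caller's point of view.  The tag
 * passed to arc_buf_alloc() must be the same one later handed back when
 * the buffer is released.  The helper is hypothetical and compiled out.
 */
#if 0
static void
example_alloc_and_free(spa_t *spa)
{
	arc_buf_t *buf;

	buf = arc_buf_alloc(spa, SPA_MINBLOCKSIZE, FTAG, ARC_BUFC_DATA);
	/* ... fill buf->b_data with SPA_MINBLOCKSIZE bytes ... */
	arc_buf_free(buf, FTAG);
}
#endif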
1315
1316 static char *arc_onloan_tag = "onloan";
1317
1318 /*
1319 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1320 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1321 * buffers must be returned to the arc before they can be used by the DMU or
1322 * freed.
1323 */
1324 arc_buf_t *
1325 arc_loan_buf(spa_t *spa, int size)
1326 {
1327 arc_buf_t *buf;
1328
1329 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1330
1331 atomic_add_64(&arc_loaned_bytes, size);
1332 return (buf);
1333 }
1334
1335 /*
1336 * Return a loaned arc buffer to the arc.
1337 */
1338 void
1339 arc_return_buf(arc_buf_t *buf, void *tag)
1340 {
1341 arc_buf_hdr_t *hdr = buf->b_hdr;
1342
1343 ASSERT(buf->b_data != NULL);
1344 (void) refcount_add(&hdr->b_refcnt, tag);
1345 (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1346
1347 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1348 }
1349
1350 /* Detach an arc_buf from a dbuf (tag) */
1351 void
1352 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1353 {
1354 arc_buf_hdr_t *hdr;
1355
1356 ASSERT(buf->b_data != NULL);
1357 hdr = buf->b_hdr;
1358 (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1359 (void) refcount_remove(&hdr->b_refcnt, tag);
1360 buf->b_efunc = NULL;
1361 buf->b_private = NULL;
1362
1363 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1364 }
1365
1366 static arc_buf_t *
1367 arc_buf_clone(arc_buf_t *from)
1368 {
1369 arc_buf_t *buf;
1370 arc_buf_hdr_t *hdr = from->b_hdr;
1371 uint64_t size = hdr->b_size;
1372
1373 ASSERT(hdr->b_state != arc_anon);
1374
1375 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1376 buf->b_hdr = hdr;
1377 buf->b_data = NULL;
1378 buf->b_efunc = NULL;
1379 buf->b_private = NULL;
1380 buf->b_next = hdr->b_buf;
1381 hdr->b_buf = buf;
1382 arc_get_data_buf(buf);
1383 bcopy(from->b_data, buf->b_data, size);
1384
1385 /*
1386 * This buffer already exists in the arc so create a duplicate
1387 * copy for the caller. If the buffer is associated with user data
1388 * then track the size and number of duplicates. These stats will be
1389 * updated as duplicate buffers are created and destroyed.
1390 */
1391 if (hdr->b_type == ARC_BUFC_DATA) {
1392 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1393 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1394 }
1395 hdr->b_datacnt += 1;
1396 return (buf);
1397 }
1398
1399 void
1400 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1401 {
1402 arc_buf_hdr_t *hdr;
1403 kmutex_t *hash_lock;
1404
1405 /*
1406 * Check to see if this buffer is evicted. Callers
1407 * must verify b_data != NULL to know if the add_ref
1408 * was successful.
1409 */
1410 mutex_enter(&buf->b_evict_lock);
1411 if (buf->b_data == NULL) {
1412 mutex_exit(&buf->b_evict_lock);
1413 return;
1414 }
1415 hash_lock = HDR_LOCK(buf->b_hdr);
1416 mutex_enter(hash_lock);
1417 hdr = buf->b_hdr;
1418 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1419 mutex_exit(&buf->b_evict_lock);
1420
1421 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1422 add_reference(hdr, hash_lock, tag);
1423 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1424 arc_access(hdr, hash_lock);
1425 mutex_exit(hash_lock);
1426 ARCSTAT_BUMP(arcstat_hits);
1427 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1428 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1429 data, metadata, hits);
1430 }
1431
1432 /*
1433 * Free the arc data buffer. If it is an l2arc write in progress,
1434 * the buffer is placed on l2arc_free_on_write to be freed later.
1435 */
1436 static void
1437 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1438 void *data, size_t size)
1439 {
1440 if (HDR_L2_WRITING(hdr)) {
1441 l2arc_data_free_t *df;
1442 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
1443 df->l2df_data = data;
1444 df->l2df_size = size;
1445 df->l2df_func = free_func;
1446 mutex_enter(&l2arc_free_on_write_mtx);
1447 list_insert_head(l2arc_free_on_write, df);
1448 mutex_exit(&l2arc_free_on_write_mtx);
1449 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1450 } else {
1451 free_func(data, size);
1452 }
1453 }
1454
1455 static void
1456 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1457 {
1458 arc_buf_t **bufp;
1459
1460 /* free up data associated with the buf */
1461 if (buf->b_data) {
1462 arc_state_t *state = buf->b_hdr->b_state;
1463 uint64_t size = buf->b_hdr->b_size;
1464 arc_buf_contents_t type = buf->b_hdr->b_type;
1465
1466 arc_cksum_verify(buf);
1467
1468 if (!recycle) {
1469 if (type == ARC_BUFC_METADATA) {
1470 arc_buf_data_free(buf->b_hdr, zio_buf_free,
1471 buf->b_data, size);
1472 arc_space_return(size, ARC_SPACE_DATA);
1473 } else {
1474 ASSERT(type == ARC_BUFC_DATA);
1475 arc_buf_data_free(buf->b_hdr,
1476 zio_data_buf_free, buf->b_data, size);
1477 ARCSTAT_INCR(arcstat_data_size, -size);
1478 atomic_add_64(&arc_size, -size);
1479 }
1480 }
1481 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1482 uint64_t *cnt = &state->arcs_lsize[type];
1483
1484 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1485 ASSERT(state != arc_anon);
1486
1487 ASSERT3U(*cnt, >=, size);
1488 atomic_add_64(cnt, -size);
1489 }
1490 ASSERT3U(state->arcs_size, >=, size);
1491 atomic_add_64(&state->arcs_size, -size);
1492 buf->b_data = NULL;
1493
1494 /*
1495 * If we're destroying a duplicate buffer make sure
1496 * that the appropriate statistics are updated.
1497 */
1498 if (buf->b_hdr->b_datacnt > 1 &&
1499 buf->b_hdr->b_type == ARC_BUFC_DATA) {
1500 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1501 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1502 }
1503 ASSERT(buf->b_hdr->b_datacnt > 0);
1504 buf->b_hdr->b_datacnt -= 1;
1505 }
1506
1507 /* only remove the buf if requested */
1508 if (!all)
1509 return;
1510
1511 /* remove the buf from the hdr list */
1512 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1513 continue;
1514 *bufp = buf->b_next;
1515 buf->b_next = NULL;
1516
1517 ASSERT(buf->b_efunc == NULL);
1518
1519 /* clean up the buf */
1520 buf->b_hdr = NULL;
1521 kmem_cache_free(buf_cache, buf);
1522 }
1523
1524 static void
1525 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1526 {
1527 l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1528
1529 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1530 ASSERT3P(hdr->b_state, ==, arc_anon);
1531 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1532
1533 if (l2hdr != NULL) {
1534 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1535 /*
1536 * To prevent arc_free() and l2arc_evict() from
1537 * attempting to free the same buffer at the same time,
1538 * a FREE_IN_PROGRESS flag is given to arc_free() to
1539 * give it priority. l2arc_evict() can't destroy this
1540 * header while we are waiting on l2arc_buflist_mtx.
1541 *
1542 * The hdr may be removed from l2ad_buflist before we
1543 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1544 */
1545 if (!buflist_held) {
1546 mutex_enter(&l2arc_buflist_mtx);
1547 l2hdr = hdr->b_l2hdr;
1548 }
1549
1550 if (l2hdr != NULL) {
1551 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1552 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1553 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1554 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1555 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
1556 if (hdr->b_state == arc_l2c_only)
1557 l2arc_hdr_stat_remove();
1558 hdr->b_l2hdr = NULL;
1559 }
1560
1561 if (!buflist_held)
1562 mutex_exit(&l2arc_buflist_mtx);
1563 }
1564
1565 if (!BUF_EMPTY(hdr)) {
1566 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1567 buf_discard_identity(hdr);
1568 }
1569 while (hdr->b_buf) {
1570 arc_buf_t *buf = hdr->b_buf;
1571
1572 if (buf->b_efunc) {
1573 mutex_enter(&arc_eviction_mtx);
1574 mutex_enter(&buf->b_evict_lock);
1575 ASSERT(buf->b_hdr != NULL);
1576 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1577 hdr->b_buf = buf->b_next;
1578 buf->b_hdr = &arc_eviction_hdr;
1579 buf->b_next = arc_eviction_list;
1580 arc_eviction_list = buf;
1581 mutex_exit(&buf->b_evict_lock);
1582 mutex_exit(&arc_eviction_mtx);
1583 } else {
1584 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1585 }
1586 }
1587 if (hdr->b_freeze_cksum != NULL) {
1588 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1589 hdr->b_freeze_cksum = NULL;
1590 }
1591
1592 ASSERT(!list_link_active(&hdr->b_arc_node));
1593 ASSERT3P(hdr->b_hash_next, ==, NULL);
1594 ASSERT3P(hdr->b_acb, ==, NULL);
1595 kmem_cache_free(hdr_cache, hdr);
1596 }
1597
1598 void
1599 arc_buf_free(arc_buf_t *buf, void *tag)
1600 {
1601 arc_buf_hdr_t *hdr = buf->b_hdr;
1602 int hashed = hdr->b_state != arc_anon;
1603
1604 ASSERT(buf->b_efunc == NULL);
1605 ASSERT(buf->b_data != NULL);
1606
1607 if (hashed) {
1608 kmutex_t *hash_lock = HDR_LOCK(hdr);
1609
1610 mutex_enter(hash_lock);
1611 hdr = buf->b_hdr;
1612 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1613
1614 (void) remove_reference(hdr, hash_lock, tag);
1615 if (hdr->b_datacnt > 1) {
1616 arc_buf_destroy(buf, FALSE, TRUE);
1617 } else {
1618 ASSERT(buf == hdr->b_buf);
1619 ASSERT(buf->b_efunc == NULL);
1620 hdr->b_flags |= ARC_BUF_AVAILABLE;
1621 }
1622 mutex_exit(hash_lock);
1623 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1624 int destroy_hdr;
1625 /*
1626 * We are in the middle of an async write. Don't destroy
1627 * this buffer unless the write completes before we finish
1628 * decrementing the reference count.
1629 */
1630 mutex_enter(&arc_eviction_mtx);
1631 (void) remove_reference(hdr, NULL, tag);
1632 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1633 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1634 mutex_exit(&arc_eviction_mtx);
1635 if (destroy_hdr)
1636 arc_hdr_destroy(hdr);
1637 } else {
1638 if (remove_reference(hdr, NULL, tag) > 0)
1639 arc_buf_destroy(buf, FALSE, TRUE);
1640 else
1641 arc_hdr_destroy(hdr);
1642 }
1643 }
1644
1645 int
1646 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1647 {
1648 arc_buf_hdr_t *hdr = buf->b_hdr;
1649 kmutex_t *hash_lock = NULL;
1650 int no_callback = (buf->b_efunc == NULL);
1651
1652 if (hdr->b_state == arc_anon) {
1653 ASSERT(hdr->b_datacnt == 1);
1654 arc_buf_free(buf, tag);
1655 return (no_callback);
1656 }
1657
1658 hash_lock = HDR_LOCK(hdr);
1659 mutex_enter(hash_lock);
1660 hdr = buf->b_hdr;
1661 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1662 ASSERT(hdr->b_state != arc_anon);
1663 ASSERT(buf->b_data != NULL);
1664
1665 (void) remove_reference(hdr, hash_lock, tag);
1666 if (hdr->b_datacnt > 1) {
1667 if (no_callback)
1668 arc_buf_destroy(buf, FALSE, TRUE);
1669 } else if (no_callback) {
1670 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1671 ASSERT(buf->b_efunc == NULL);
1672 hdr->b_flags |= ARC_BUF_AVAILABLE;
1673 }
1674 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1675 refcount_is_zero(&hdr->b_refcnt));
1676 mutex_exit(hash_lock);
1677 return (no_callback);
1678 }
1679
1680 int
1681 arc_buf_size(arc_buf_t *buf)
1682 {
1683 return (buf->b_hdr->b_size);
1684 }
1685
1686 /*
1687 * Called from the DMU to determine if the current buffer should be
1688 * evicted. In order to ensure proper locking, the eviction must be initiated
1689 * from the DMU. Return true if the buffer is associated with user data and
1690 * duplicate buffers still exist.
1691 */
1692 boolean_t
1693 arc_buf_eviction_needed(arc_buf_t *buf)
1694 {
1695 arc_buf_hdr_t *hdr;
1696 boolean_t evict_needed = B_FALSE;
1697
1698 if (zfs_disable_dup_eviction)
1699 return (B_FALSE);
1700
1701 mutex_enter(&buf->b_evict_lock);
1702 hdr = buf->b_hdr;
1703 if (hdr == NULL) {
1704 /*
1705 * We are in arc_do_user_evicts(); let that function
1706 * perform the eviction.
1707 */
1708 ASSERT(buf->b_data == NULL);
1709 mutex_exit(&buf->b_evict_lock);
1710 return (B_FALSE);
1711 } else if (buf->b_data == NULL) {
1712 /*
1713 * We have already been added to the arc eviction list;
1714 * recommend eviction.
1715 */
1716 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1717 mutex_exit(&buf->b_evict_lock);
1718 return (B_TRUE);
1719 }
1720
1721 if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1722 evict_needed = B_TRUE;
1723
1724 mutex_exit(&buf->b_evict_lock);
1725 return (evict_needed);
1726 }
1727
1728 /*
1729 * Evict buffers from list until we've removed the specified number of
1730 * bytes. Move the removed buffers to the appropriate evict state.
1731 * If the recycle flag is set, then attempt to "recycle" a buffer:
1732 * - look for a buffer to evict that is `bytes' long.
1733 * - return the data block from this buffer rather than freeing it.
1734 * This flag is used by callers that are trying to make space for a
1735 * new buffer in a full arc cache.
1736 *
1737 * This function makes a "best effort". It skips over any buffers
1738 * it can't get a hash_lock on, and so may not catch all candidates.
1739 * It may also return without evicting as much space as requested.
1740 */
1741 static void *
1742 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1743 arc_buf_contents_t type)
1744 {
1745 arc_state_t *evicted_state;
1746 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1747 arc_buf_hdr_t *ab, *ab_prev = NULL;
1748 list_t *list = &state->arcs_list[type];
1749 kmutex_t *hash_lock;
1750 boolean_t have_lock;
1751 void *stolen = NULL;
1752
1753 ASSERT(state == arc_mru || state == arc_mfu);
1754
1755 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1756
1757 mutex_enter(&state->arcs_mtx);
1758 mutex_enter(&evicted_state->arcs_mtx);
1759
1760 for (ab = list_tail(list); ab; ab = ab_prev) {
1761 ab_prev = list_prev(list, ab);
1762 /* prefetch buffers have a minimum lifespan */
1763 if (HDR_IO_IN_PROGRESS(ab) ||
1764 (spa && ab->b_spa != spa) ||
1765 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1766 ddi_get_lbolt() - ab->b_arc_access <
1767 zfs_arc_min_prefetch_lifespan)) {
1768 skipped++;
1769 continue;
1770 }
1771 /* "lookahead" for better eviction candidate */
1772 if (recycle && ab->b_size != bytes &&
1773 ab_prev && ab_prev->b_size == bytes)
1774 continue;
1775 hash_lock = HDR_LOCK(ab);
1776 have_lock = MUTEX_HELD(hash_lock);
1777 if (have_lock || mutex_tryenter(hash_lock)) {
1778 ASSERT0(refcount_count(&ab->b_refcnt));
1779 ASSERT(ab->b_datacnt > 0);
1780 while (ab->b_buf) {
1781 arc_buf_t *buf = ab->b_buf;
1782 if (!mutex_tryenter(&buf->b_evict_lock)) {
1783 missed += 1;
1784 break;
1785 }
1786 if (buf->b_data) {
1787 bytes_evicted += ab->b_size;
1788 if (recycle && ab->b_type == type &&
1789 ab->b_size == bytes &&
1790 !HDR_L2_WRITING(ab)) {
1791 stolen = buf->b_data;
1792 recycle = FALSE;
1793 }
1794 }
1795 if (buf->b_efunc) {
1796 mutex_enter(&arc_eviction_mtx);
1797 arc_buf_destroy(buf,
1798 buf->b_data == stolen, FALSE);
1799 ab->b_buf = buf->b_next;
1800 buf->b_hdr = &arc_eviction_hdr;
1801 buf->b_next = arc_eviction_list;
1802 arc_eviction_list = buf;
1803 mutex_exit(&arc_eviction_mtx);
1804 mutex_exit(&buf->b_evict_lock);
1805 } else {
1806 mutex_exit(&buf->b_evict_lock);
1807 arc_buf_destroy(buf,
1808 buf->b_data == stolen, TRUE);
1809 }
1810 }
1811
1812 if (ab->b_l2hdr) {
1813 ARCSTAT_INCR(arcstat_evict_l2_cached,
1814 ab->b_size);
1815 } else {
1816 if (l2arc_write_eligible(ab->b_spa, ab)) {
1817 ARCSTAT_INCR(arcstat_evict_l2_eligible,
1818 ab->b_size);
1819 } else {
1820 ARCSTAT_INCR(
1821 arcstat_evict_l2_ineligible,
1822 ab->b_size);
1823 }
1824 }
1825
1826 if (ab->b_datacnt == 0) {
1827 arc_change_state(evicted_state, ab, hash_lock);
1828 ASSERT(HDR_IN_HASH_TABLE(ab));
1829 ab->b_flags |= ARC_IN_HASH_TABLE;
1830 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1831 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1832 }
1833 if (!have_lock)
1834 mutex_exit(hash_lock);
1835 if (bytes >= 0 && bytes_evicted >= bytes)
1836 break;
1837 } else {
1838 missed += 1;
1839 }
1840 }
1841
1842 mutex_exit(&evicted_state->arcs_mtx);
1843 mutex_exit(&state->arcs_mtx);
1844
1845 if (bytes_evicted < bytes)
1846 dprintf("only evicted %lld bytes from %p\n",
1847 (longlong_t)bytes_evicted, state);
1848
1849 if (skipped)
1850 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1851
1852 if (missed)
1853 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1854
1855 /*
1856 * We have just evicted some data into the ghost state, so make
1857 * sure we also adjust the ghost state size if necessary.
1858 */
1859 if (arc_no_grow &&
1860 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1861 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1862 arc_mru_ghost->arcs_size - arc_c;
1863
1864 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1865 int64_t todelete =
1866 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1867 arc_evict_ghost(arc_mru_ghost, 0, todelete);
1868 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1869 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1870 arc_mru_ghost->arcs_size +
1871 arc_mfu_ghost->arcs_size - arc_c);
1872 arc_evict_ghost(arc_mfu_ghost, 0, todelete);
1873 }
1874 }
1875
1876 return (stolen);
1877 }
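/*
 * Illustrative sketch of the recycle path described above; `size' stands
 * for a caller-chosen block size, and the calls mirror what
 * arc_get_data_buf() does further below when the cache is full:
 *
 *	void *data;
 *
 *	data = arc_evict(arc_mru, 0, size, TRUE, ARC_BUFC_DATA);
 *	if (data == NULL)
 *		data = zio_data_buf_alloc(size);	(recycle miss)
 *
 * On a successful recycle the evicted buffer's data block is handed
 * straight to the new consumer, saving a free/alloc pair of `size' bytes.
 */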
1878
1879 /*
1880 * Remove buffers from list until we've removed the specified number of
1881 * bytes. Destroy the buffers that are removed.
1882 */
1883 static void
1884 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1885 {
1886 arc_buf_hdr_t *ab, *ab_prev;
1887 arc_buf_hdr_t marker;
1888 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1889 kmutex_t *hash_lock;
1890 uint64_t bytes_deleted = 0;
1891 uint64_t bufs_skipped = 0;
1892
1893 ASSERT(GHOST_STATE(state));
1894 bzero(&marker, sizeof (marker));
1895 top:
1896 mutex_enter(&state->arcs_mtx);
1897 for (ab = list_tail(list); ab; ab = ab_prev) {
1898 ab_prev = list_prev(list, ab);
1899 if (spa && ab->b_spa != spa)
1900 continue;
1901
1902 /* ignore markers */
1903 if (ab->b_spa == 0)
1904 continue;
1905
1906 hash_lock = HDR_LOCK(ab);
1907 /* caller may be trying to modify this buffer, skip it */
1908 if (MUTEX_HELD(hash_lock))
1909 continue;
1910 if (mutex_tryenter(hash_lock)) {
1911 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1912 ASSERT(ab->b_buf == NULL);
1913 ARCSTAT_BUMP(arcstat_deleted);
1914 bytes_deleted += ab->b_size;
1915
1916 if (ab->b_l2hdr != NULL) {
1917 /*
1918 * This buffer is cached on the 2nd Level ARC;
1919 * don't destroy the header.
1920 */
1921 arc_change_state(arc_l2c_only, ab, hash_lock);
1922 mutex_exit(hash_lock);
1923 } else {
1924 arc_change_state(arc_anon, ab, hash_lock);
1925 mutex_exit(hash_lock);
1926 arc_hdr_destroy(ab);
1927 }
1928
1929 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1930 if (bytes >= 0 && bytes_deleted >= bytes)
1931 break;
1932 } else if (bytes < 0) {
1933 /*
1934 * Insert a list marker and then wait for the
1935 * hash lock to become available. Once it's
1936 * available, restart from where we left off.
1937 */
1938 list_insert_after(list, ab, &marker);
1939 mutex_exit(&state->arcs_mtx);
1940 mutex_enter(hash_lock);
1941 mutex_exit(hash_lock);
1942 mutex_enter(&state->arcs_mtx);
1943 ab_prev = list_prev(list, &marker);
1944 list_remove(list, &marker);
1945 } else
1946 bufs_skipped += 1;
1947 }
1948 mutex_exit(&state->arcs_mtx);
1949
1950 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1951 (bytes < 0 || bytes_deleted < bytes)) {
1952 list = &state->arcs_list[ARC_BUFC_METADATA];
1953 goto top;
1954 }
1955
1956 if (bufs_skipped) {
1957 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1958 ASSERT(bytes >= 0);
1959 }
1960
1961 if (bytes_deleted < bytes)
1962 dprintf("only deleted %lld bytes from %p\n",
1963 (longlong_t)bytes_deleted, state);
1964 }
1965
1966 static void
1967 arc_adjust(void)
1968 {
1969 int64_t adjustment, delta;
1970
1971 /*
1972 * Adjust MRU size
1973 */
1974
1975 adjustment = MIN((int64_t)(arc_size - arc_c),
1976 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1977 arc_p));
1978
1979 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1980 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1981 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
1982 adjustment -= delta;
1983 }
1984
1985 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1986 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1987 (void) arc_evict(arc_mru, 0, delta, FALSE,
1988 ARC_BUFC_METADATA);
1989 }
1990
1991 /*
1992 * Adjust MFU size
1993 */
1994
1995 adjustment = arc_size - arc_c;
1996
1997 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1998 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1999 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2000 adjustment -= delta;
2001 }
2002
2003 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2004 int64_t delta = MIN(adjustment,
2005 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2006 (void) arc_evict(arc_mfu, 0, delta, FALSE,
2007 ARC_BUFC_METADATA);
2008 }
2009
2010 /*
2011 * Adjust ghost lists
2012 */
2013
2014 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2015
2016 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2017 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2018 arc_evict_ghost(arc_mru_ghost, 0, delta);
2019 }
2020
2021 adjustment =
2022 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2023
2024 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2025 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2026 arc_evict_ghost(arc_mfu_ghost, 0, delta);
2027 }
2028 }
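/*
 * A worked example of the MRU adjustment above, using assumed sizes:
 * with arc_size = 10GB, arc_c = 8GB, arc_anon + arc_mru + arc_meta_used
 * = 5GB and arc_p = 4GB, the MRU target is MIN(10GB - 8GB, 5GB - 4GB) =
 * 1GB, taken first from evictable MRU data and then from MRU metadata.
 * The MFU pass then evicts whatever of the remaining (arc_size - arc_c)
 * overshoot is left, and the ghost lists are trimmed back toward arc_c
 * last.
 */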
2029
2030 /*
2031 * Request that the arc user drop references so that N bytes can be
2032 * released from the cache. This provides a mechanism to ensure the arc
2033 * can honor the arc_meta_limit and reclaim buffers which are pinned in
2034 * the cache by higher layers (i.e. the zpl).
2035 */
2036 static void
2037 arc_do_user_prune(int64_t adjustment)
2038 {
2039 arc_prune_func_t *func;
2040 void *private;
2041 arc_prune_t *cp, *np;
2042
2043 mutex_enter(&arc_prune_mtx);
2044
2045 cp = list_head(&arc_prune_list);
2046 while (cp != NULL) {
2047 func = cp->p_pfunc;
2048 private = cp->p_private;
2049 np = list_next(&arc_prune_list, cp);
2050 refcount_add(&cp->p_refcnt, func);
2051 mutex_exit(&arc_prune_mtx);
2052
2053 if (func != NULL)
2054 func(adjustment, private);
2055
2056 mutex_enter(&arc_prune_mtx);
2057
2058 /* User removed prune callback concurrently with execution */
2059 if (refcount_remove(&cp->p_refcnt, func) == 0) {
2060 ASSERT(!list_link_active(&cp->p_node));
2061 refcount_destroy(&cp->p_refcnt);
2062 kmem_free(cp, sizeof (*cp));
2063 }
2064
2065 cp = np;
2066 }
2067
2068 ARCSTAT_BUMP(arcstat_prune);
2069 mutex_exit(&arc_prune_mtx);
2070 }
2071
2072 static void
2073 arc_do_user_evicts(void)
2074 {
2075 mutex_enter(&arc_eviction_mtx);
2076 while (arc_eviction_list != NULL) {
2077 arc_buf_t *buf = arc_eviction_list;
2078 arc_eviction_list = buf->b_next;
2079 mutex_enter(&buf->b_evict_lock);
2080 buf->b_hdr = NULL;
2081 mutex_exit(&buf->b_evict_lock);
2082 mutex_exit(&arc_eviction_mtx);
2083
2084 if (buf->b_efunc != NULL)
2085 VERIFY(buf->b_efunc(buf) == 0);
2086
2087 buf->b_efunc = NULL;
2088 buf->b_private = NULL;
2089 kmem_cache_free(buf_cache, buf);
2090 mutex_enter(&arc_eviction_mtx);
2091 }
2092 mutex_exit(&arc_eviction_mtx);
2093 }
2094
2095 /*
2096 * Evict only metadata objects from the cache, leaving the data objects.
2097 * This is only used to enforce the tunable arc_meta_limit; if we are
2098 * unable to evict enough buffers, notify the user via the prune callback.
2099 */
2100 void
2101 arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
2102 {
2103 int64_t delta;
2104
2105 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2106 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2107 arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
2108 adjustment -= delta;
2109 }
2110
2111 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2112 delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2113 arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
2114 adjustment -= delta;
2115 }
2116
2117 if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
2118 arc_do_user_prune(zfs_arc_meta_prune);
2119 }
2120
2121 /*
2122 * Flush all *evictable* data from the cache for the given spa.
2123 * NOTE: this will not touch "active" (i.e. referenced) data.
2124 */
2125 void
2126 arc_flush(spa_t *spa)
2127 {
2128 uint64_t guid = 0;
2129
2130 if (spa)
2131 guid = spa_load_guid(spa);
2132
2133 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2134 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2135 if (spa)
2136 break;
2137 }
2138 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2139 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2140 if (spa)
2141 break;
2142 }
2143 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2144 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2145 if (spa)
2146 break;
2147 }
2148 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2149 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2150 if (spa)
2151 break;
2152 }
2153
2154 arc_evict_ghost(arc_mru_ghost, guid, -1);
2155 arc_evict_ghost(arc_mfu_ghost, guid, -1);
2156
2157 mutex_enter(&arc_reclaim_thr_lock);
2158 arc_do_user_evicts();
2159 mutex_exit(&arc_reclaim_thr_lock);
2160 ASSERT(spa || arc_eviction_list == NULL);
2161 }
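/*
 * For example, arc_flush(NULL) keeps looping until every evictable
 * buffer from any pool is gone, whereas arc_flush(spa) matches only that
 * pool's load guid and makes a single eviction pass per list (note the
 * early break above) before draining its entries from the ghost lists
 * and running any pending user evictions.
 */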
2162
2163 void
2164 arc_shrink(uint64_t bytes)
2165 {
2166 if (arc_c > arc_c_min) {
2167 uint64_t to_free;
2168
2169 to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
2170
2171 if (arc_c > arc_c_min + to_free)
2172 atomic_add_64(&arc_c, -to_free);
2173 else
2174 arc_c = arc_c_min;
2175
2176 atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
2177 if (arc_c > arc_size)
2178 arc_c = MAX(arc_size, arc_c_min);
2179 if (arc_p > arc_c)
2180 arc_p = (arc_c >> 1);
2181 ASSERT(arc_c >= arc_c_min);
2182 ASSERT((int64_t)arc_p >= 0);
2183 }
2184
2185 if (arc_size > arc_c)
2186 arc_adjust();
2187 }
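/*
 * A quick example of the arithmetic above, assuming arc_c = 8GB,
 * arc_c_min = 1GB and zfs_arc_shrink_shift = 5: a call with bytes == 0
 * frees arc_c >> 5 = 256MB of target, dropping arc_c to 7.75GB, and
 * shaves arc_p by the same shift. If arc_size still exceeds the new
 * arc_c, arc_adjust() is invoked to evict the difference.
 */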
2188
2189 static void
2190 arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
2191 {
2192 size_t i;
2193 kmem_cache_t *prev_cache = NULL;
2194 kmem_cache_t *prev_data_cache = NULL;
2195 extern kmem_cache_t *zio_buf_cache[];
2196 extern kmem_cache_t *zio_data_buf_cache[];
2197
2198 /*
2199 * An aggressive reclamation will shrink the cache size as well as
2200 * reap free buffers from the arc kmem caches.
2201 */
2202 if (strat == ARC_RECLAIM_AGGR)
2203 arc_shrink(bytes);
2204
2205 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2206 if (zio_buf_cache[i] != prev_cache) {
2207 prev_cache = zio_buf_cache[i];
2208 kmem_cache_reap_now(zio_buf_cache[i]);
2209 }
2210 if (zio_data_buf_cache[i] != prev_data_cache) {
2211 prev_data_cache = zio_data_buf_cache[i];
2212 kmem_cache_reap_now(zio_data_buf_cache[i]);
2213 }
2214 }
2215
2216 kmem_cache_reap_now(buf_cache);
2217 kmem_cache_reap_now(hdr_cache);
2218 }
2219
2220 /*
2221 * Unlike other ZFS implementations, this thread is only responsible for
2222 * adapting the target ARC size on Linux. The responsibility for memory
2223 * reclamation has been entirely delegated to the arc_shrinker_func(),
2224 * which is registered with the VM. To reflect this change in behavior,
2225 * the arc_reclaim thread has been renamed to arc_adapt.
2226 */
2227 static void
2228 arc_adapt_thread(void)
2229 {
2230 callb_cpr_t cpr;
2231 int64_t prune;
2232
2233 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2234
2235 mutex_enter(&arc_reclaim_thr_lock);
2236 while (arc_thread_exit == 0) {
2237 #ifndef _KERNEL
2238 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
2239
2240 if (spa_get_random(100) == 0) {
2241
2242 if (arc_no_grow) {
2243 if (last_reclaim == ARC_RECLAIM_CONS) {
2244 last_reclaim = ARC_RECLAIM_AGGR;
2245 } else {
2246 last_reclaim = ARC_RECLAIM_CONS;
2247 }
2248 } else {
2249 arc_no_grow = TRUE;
2250 last_reclaim = ARC_RECLAIM_AGGR;
2251 membar_producer();
2252 }
2253
2254 /* reset the growth delay for every reclaim */
2255 arc_grow_time = ddi_get_lbolt()+(zfs_arc_grow_retry * hz);
2256
2257 arc_kmem_reap_now(last_reclaim, 0);
2258 arc_warm = B_TRUE;
2259 }
2260 #endif /* !_KERNEL */
2261
2262 /* No recent memory pressure; allow the ARC to grow. */
2263 if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
2264 arc_no_grow = FALSE;
2265
2266 /*
2267 * Keep metadata usage within limits; arc_shrink() is not
2268 * used here because it would collapse the arc_c value when
2269 * only the arc_meta_limit is being exceeded.
2270 */
2271 prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
2272 if (prune > 0)
2273 arc_adjust_meta(prune, B_TRUE);
2274
2275 arc_adjust();
2276
2277 if (arc_eviction_list != NULL)
2278 arc_do_user_evicts();
2279
2280 /* block until needed, or one second, whichever is shorter */
2281 CALLB_CPR_SAFE_BEGIN(&cpr);
2282 (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
2283 &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2284 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2285
2286
2287 /* Allow the module options to be changed */
2288 if (zfs_arc_max > 64 << 20 &&
2289 zfs_arc_max < physmem * PAGESIZE &&
2290 zfs_arc_max != arc_c_max)
2291 arc_c_max = zfs_arc_max;
2292
2293 if (zfs_arc_min > 0 &&
2294 zfs_arc_min < arc_c_max &&
2295 zfs_arc_min != arc_c_min)
2296 arc_c_min = zfs_arc_min;
2297
2298 if (zfs_arc_meta_limit > 0 &&
2299 zfs_arc_meta_limit <= arc_c_max &&
2300 zfs_arc_meta_limit != arc_meta_limit)
2301 arc_meta_limit = zfs_arc_meta_limit;
2302
2303
2304
2305 }
2306
2307 arc_thread_exit = 0;
2308 cv_broadcast(&arc_reclaim_thr_cv);
2309 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2310 thread_exit();
2311 }
2312
2313 #ifdef _KERNEL
2314 /*
2315 * Determine the amount of memory eligible for eviction contained in the
2316 * ARC. All clean data reported by the ghost lists can always be safely
2317 * evicted. Due to arc_c_min, the same does not hold for all clean data
2318 * contained by the regular mru and mfu lists.
2319 *
2320 * In the case of the regular mru and mfu lists, we need to report as
2321 * much clean data as possible, such that evicting that same reported
2322 * data will not bring arc_size below arc_c_min. Thus, in certain
2323 * circumstances, the total amount of clean data in the mru and mfu
2324 * lists might not actually be evictable.
2325 *
2326 * The following two distinct cases are accounted for:
2327 *
2328 * 1. The sum of the amount of dirty data contained by both the mru and
2329 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
2330 * is greater than or equal to arc_c_min.
2331 * (i.e. amount of dirty data >= arc_c_min)
2332 *
2333 * This is the easy case; all clean data contained by the mru and mfu
2334 * lists is evictable. Evicting all clean data can only drop arc_size
2335 * to the amount of dirty data, which is greater than arc_c_min.
2336 *
2337 * 2. The sum of the amount of dirty data contained by both the mru and
2338 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
2339 * is less than arc_c_min.
2340 * (i.e. arc_c_min > amount of dirty data)
2341 *
2342 * 2.1. arc_size is greater than or equal to arc_c_min.
2343 * (i.e. arc_size >= arc_c_min > amount of dirty data)
2344 *
2345 * In this case, not all clean data from the regular mru and mfu
2346 * lists is actually evictable; we must leave enough clean data
2347 * to keep arc_size above arc_c_min. Thus, the maximum amount of
2348 * evictable data from the two lists combined, is exactly the
2349 * difference between arc_size and arc_c_min.
2350 *
2351 * 2.2. arc_size is less than arc_c_min
2352 * (i.e. arc_c_min > arc_size > amount of dirty data)
2353 *
2354 * In this case, none of the data contained in the mru and mfu
2355 * lists is evictable, even if it's clean. Since arc_size is
2356 * already below arc_c_min, evicting any more would only
2357 * increase this negative difference.
2358 */
2359 static uint64_t
2360 arc_evictable_memory(void) {
2361 uint64_t arc_clean =
2362 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
2363 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
2364 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
2365 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
2366 uint64_t ghost_clean =
2367 arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
2368 arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
2369 arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
2370 arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
2371 uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
2372
2373 if (arc_dirty >= arc_c_min)
2374 return (ghost_clean + arc_clean);
2375
2376 return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
2377 }
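/*
 * A worked instance of case 2.1 above, with assumed sizes: if
 * arc_size = 6GB, arc_c_min = 4GB and arc_clean = 5GB, then
 * arc_dirty = 1GB < arc_c_min, so only ghost_clean + (6GB - 4GB) =
 * ghost_clean + 2GB is reported as evictable even though 5GB of the
 * regular lists is clean; evicting more would drop arc_size below
 * arc_c_min.
 */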
2378
2379 static int
2380 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
2381 {
2382 uint64_t pages;
2383
2384 /* The arc is considered warm once reclaim has occurred */
2385 if (unlikely(arc_warm == B_FALSE))
2386 arc_warm = B_TRUE;
2387
2388 /* Return the potential number of reclaimable pages */
2389 pages = btop(arc_evictable_memory());
2390 if (sc->nr_to_scan == 0)
2391 return (pages);
2392
2393 /* Not allowed to perform filesystem reclaim */
2394 if (!(sc->gfp_mask & __GFP_FS))
2395 return (-1);
2396
2397 /* Reclaim in progress */
2398 if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
2399 return (-1);
2400
2401 /*
2402 * Evict the requested number of pages by shrinking arc_c by the
2403 * requested amount. If there is nothing left to evict, just
2404 * reap whatever we can from the various arc slabs.
2405 */
2406 if (pages > 0) {
2407 arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
2408 } else {
2409 arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
2410 }
2411
2412 /*
2413 * When direct reclaim is observed, it usually indicates a rapid
2414 * increase in memory pressure. This occurs because the kswapd
2415 * threads were unable to asynchronously keep enough free memory
2416 * available. In this case set arc_no_grow to briefly pause arc
2417 * growth to avoid compounding the memory pressure.
2418 */
2419 if (current_is_kswapd()) {
2420 ARCSTAT_BUMP(arcstat_memory_indirect_count);
2421 } else {
2422 arc_no_grow = B_TRUE;
2423 arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
2424 ARCSTAT_BUMP(arcstat_memory_direct_count);
2425 }
2426
2427 mutex_exit(&arc_reclaim_thr_lock);
2428
2429 return (-1);
2430 }
2431 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
2432
2433 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
2434 #endif /* _KERNEL */
2435
2436 /*
2437 * Adapt arc info given the number of bytes we are trying to add and
2438 * the state that we are coming from. This function is only called
2439 * when we are adding new content to the cache.
2440 */
2441 static void
2442 arc_adapt(int bytes, arc_state_t *state)
2443 {
2444 int mult;
2445 uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
2446
2447 if (state == arc_l2c_only)
2448 return;
2449
2450 ASSERT(bytes > 0);
2451 /*
2452 * Adapt the target size of the MRU list:
2453 * - if we just hit in the MRU ghost list, then increase
2454 * the target size of the MRU list.
2455 * - if we just hit in the MFU ghost list, then increase
2456 * the target size of the MFU list by decreasing the
2457 * target size of the MRU list.
2458 */
2459 if (state == arc_mru_ghost) {
2460 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2461 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2462 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2463
2464 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2465 } else if (state == arc_mfu_ghost) {
2466 uint64_t delta;
2467
2468 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2469 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2470 mult = MIN(mult, 10);
2471
2472 delta = MIN(bytes * mult, arc_p);
2473 arc_p = MAX(arc_p_min, arc_p - delta);
2474 }
2475 ASSERT((int64_t)arc_p >= 0);
2476
2477 if (arc_no_grow)
2478 return;
2479
2480 if (arc_c >= arc_c_max)
2481 return;
2482
2483 /*
2484 * If we're within (2 * maxblocksize) bytes of the target
2485 * cache size, increment the target cache size
2486 */
2487 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2488 atomic_add_64(&arc_c, (int64_t)bytes);
2489 if (arc_c > arc_c_max)
2490 arc_c = arc_c_max;
2491 else if (state == arc_anon)
2492 atomic_add_64(&arc_p, (int64_t)bytes);
2493 if (arc_p > arc_c)
2494 arc_p = arc_c;
2495 }
2496 ASSERT((int64_t)arc_p >= 0);
2497 }
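/*
 * For example (assumed sizes): a hit in arc_mru_ghost while
 * arc_mfu_ghost is three times its size gives mult = 3 (capped at 10),
 * so arc_p grows by 3 * bytes, clamped to arc_c - arc_p_min. A hit in
 * arc_mfu_ghost instead shrinks arc_p by MIN(bytes * mult, arc_p),
 * floored at arc_p_min, shifting the balance toward the MFU side.
 */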
2498
2499 /*
2500 * Check if the cache has reached its limits and eviction is required
2501 * prior to insert.
2502 */
2503 static int
2504 arc_evict_needed(arc_buf_contents_t type)
2505 {
2506 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2507 return (1);
2508
2509 if (arc_no_grow)
2510 return (1);
2511
2512 return (arc_size > arc_c);
2513 }
2514
2515 /*
2516 * The buffer, supplied as the first argument, needs a data block.
2517 * So, if we are at cache max, determine which cache should be victimized.
2518 * We have the following cases:
2519 *
2520 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2521 * In this situation if we're out of space, but the resident size of the MFU is
2522 * under the limit, victimize the MFU cache to satisfy this insertion request.
2523 *
2524 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2525 * Here, we've used up all of the available space for the MRU, so we need to
2526 * evict from our own cache instead. Evict from the set of resident MRU
2527 * entries.
2528 *
2529 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2530 * c minus p represents the MFU space in the cache, since p is the size of the
2531 * cache that is dedicated to the MRU. In this situation there's still space on
2532 * the MFU side, so the MRU side needs to be victimized.
2533 *
2534 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2535 * MFU's resident set is consuming more space than it has been allotted. In
2536 * this situation, we must victimize our own cache, the MFU, for this insertion.
2537 */
2538 static void
2539 arc_get_data_buf(arc_buf_t *buf)
2540 {
2541 arc_state_t *state = buf->b_hdr->b_state;
2542 uint64_t size = buf->b_hdr->b_size;
2543 arc_buf_contents_t type = buf->b_hdr->b_type;
2544
2545 arc_adapt(size, state);
2546
2547 /*
2548 * We have not yet reached cache maximum size,
2549 * just allocate a new buffer.
2550 */
2551 if (!arc_evict_needed(type)) {
2552 if (type == ARC_BUFC_METADATA) {
2553 buf->b_data = zio_buf_alloc(size);
2554 arc_space_consume(size, ARC_SPACE_DATA);
2555 } else {
2556 ASSERT(type == ARC_BUFC_DATA);
2557 buf->b_data = zio_data_buf_alloc(size);
2558 ARCSTAT_INCR(arcstat_data_size, size);
2559 atomic_add_64(&arc_size, size);
2560 }
2561 goto out;
2562 }
2563
2564 /*
2565 * If we are prefetching from the mfu ghost list, this buffer
2566 * will end up on the mru list, so steal space from there.
2567 */
2568 if (state == arc_mfu_ghost)
2569 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2570 else if (state == arc_mru_ghost)
2571 state = arc_mru;
2572
2573 if (state == arc_mru || state == arc_anon) {
2574 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2575 state = (arc_mfu->arcs_lsize[type] >= size &&
2576 arc_p > mru_used) ? arc_mfu : arc_mru;
2577 } else {
2578 /* MFU cases */
2579 uint64_t mfu_space = arc_c - arc_p;
2580 state = (arc_mru->arcs_lsize[type] >= size &&
2581 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2582 }
2583
2584 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2585 if (type == ARC_BUFC_METADATA) {
2586 buf->b_data = zio_buf_alloc(size);
2587 arc_space_consume(size, ARC_SPACE_DATA);
2588
2589 /*
2590 * If we are unable to recycle an existing meta buffer,
2591 * signal the reclaim thread. It will notify users
2592 * via the prune callback to drop references. The
2593 * prune callback is run in the context of the reclaim
2594 * thread to avoid deadlocking on the hash_lock.
2595 */
2596 cv_signal(&arc_reclaim_thr_cv);
2597 } else {
2598 ASSERT(type == ARC_BUFC_DATA);
2599 buf->b_data = zio_data_buf_alloc(size);
2600 ARCSTAT_INCR(arcstat_data_size, size);
2601 atomic_add_64(&arc_size, size);
2602 }
2603
2604 ARCSTAT_BUMP(arcstat_recycle_miss);
2605 }
2606 ASSERT(buf->b_data != NULL);
2607 out:
2608 /*
2609 * Update the state size. Note that ghost states have a
2610 * "ghost size" and so don't need to be updated.
2611 */
2612 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2613 arc_buf_hdr_t *hdr = buf->b_hdr;
2614
2615 atomic_add_64(&hdr->b_state->arcs_size, size);
2616 if (list_link_active(&hdr->b_arc_node)) {
2617 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2618 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2619 }
2620 /*
2621 * If we are growing the cache, and we are adding anonymous
2622 * data, and we have outgrown arc_p, update arc_p
2623 */
2624 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2625 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2626 arc_p = MIN(arc_c, arc_p + size);
2627 }
2628 }
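/*
 * To make case 3 above concrete (assumed sizes): with arc_c = 8GB and
 * arc_p = 5GB, the MFU side is allotted c - p = 3GB. A miss destined
 * for the MFU while arc_mfu->arcs_size is only 2.5GB and the MRU list
 * holds at least `size' evictable bytes of the right type will steal
 * the block from arc_mru; otherwise (case 4) the MFU evicts from
 * itself.
 */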
2629
2630 /*
2631 * This routine is called whenever a buffer is accessed.
2632 * NOTE: the hash lock is dropped in this function.
2633 */
2634 static void
2635 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2636 {
2637 clock_t now;
2638
2639 ASSERT(MUTEX_HELD(hash_lock));
2640
2641 if (buf->b_state == arc_anon) {
2642 /*
2643 * This buffer is not in the cache, and does not
2644 * appear in our "ghost" list. Add the new buffer
2645 * to the MRU state.
2646 */
2647
2648 ASSERT(buf->b_arc_access == 0);
2649 buf->b_arc_access = ddi_get_lbolt();
2650 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2651 arc_change_state(arc_mru, buf, hash_lock);
2652
2653 } else if (buf->b_state == arc_mru) {
2654 now = ddi_get_lbolt();
2655
2656 /*
2657 * If this buffer is here because of a prefetch, then either:
2658 * - clear the flag if this is a "referencing" read
2659 * (any subsequent access will bump this into the MFU state).
2660 * or
2661 * - move the buffer to the head of the list if this is
2662 * another prefetch (to make it less likely to be evicted).
2663 */
2664 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2665 if (refcount_count(&buf->b_refcnt) == 0) {
2666 ASSERT(list_link_active(&buf->b_arc_node));
2667 } else {
2668 buf->b_flags &= ~ARC_PREFETCH;
2669 ARCSTAT_BUMP(arcstat_mru_hits);
2670 }
2671 buf->b_arc_access = now;
2672 return;
2673 }
2674
2675 /*
2676 * This buffer has been "accessed" only once so far,
2677 * but it is still in the cache. Move it to the MFU
2678 * state.
2679 */
2680 if (now > buf->b_arc_access + ARC_MINTIME) {
2681 /*
2682 * More than 125ms have passed since we
2683 * instantiated this buffer. Move it to the
2684 * most frequently used state.
2685 */
2686 buf->b_arc_access = now;
2687 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2688 arc_change_state(arc_mfu, buf, hash_lock);
2689 }
2690 ARCSTAT_BUMP(arcstat_mru_hits);
2691 } else if (buf->b_state == arc_mru_ghost) {
2692 arc_state_t *new_state;
2693 /*
2694 * This buffer has been "accessed" recently, but
2695 * was evicted from the cache. Move it to the
2696 * MFU state.
2697 */
2698
2699 if (buf->b_flags & ARC_PREFETCH) {
2700 new_state = arc_mru;
2701 if (refcount_count(&buf->b_refcnt) > 0)
2702 buf->b_flags &= ~ARC_PREFETCH;
2703 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2704 } else {
2705 new_state = arc_mfu;
2706 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2707 }
2708
2709 buf->b_arc_access = ddi_get_lbolt();
2710 arc_change_state(new_state, buf, hash_lock);
2711
2712 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2713 } else if (buf->b_state == arc_mfu) {
2714 /*
2715 * This buffer has been accessed more than once and is
2716 * still in the cache. Keep it in the MFU state.
2717 *
2718 * NOTE: an add_reference() that occurred when we did
2719 * the arc_read() will have kicked this off the list.
2720 * If it was a prefetch, we will explicitly move it to
2721 * the head of the list now.
2722 */
2723 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2724 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2725 ASSERT(list_link_active(&buf->b_arc_node));
2726 }
2727 ARCSTAT_BUMP(arcstat_mfu_hits);
2728 buf->b_arc_access = ddi_get_lbolt();
2729 } else if (buf->b_state == arc_mfu_ghost) {
2730 arc_state_t *new_state = arc_mfu;
2731 /*
2732 * This buffer has been accessed more than once but has
2733 * been evicted from the cache. Move it back to the
2734 * MFU state.
2735 */
2736
2737 if (buf->b_flags & ARC_PREFETCH) {
2738 /*
2739 * This is a prefetch access...
2740 * move this block back to the MRU state.
2741 */
2742 ASSERT0(refcount_count(&buf->b_refcnt));
2743 new_state = arc_mru;
2744 }
2745
2746 buf->b_arc_access = ddi_get_lbolt();
2747 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2748 arc_change_state(new_state, buf, hash_lock);
2749
2750 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2751 } else if (buf->b_state == arc_l2c_only) {
2752 /*
2753 * This buffer is on the 2nd Level ARC.
2754 */
2755
2756 buf->b_arc_access = ddi_get_lbolt();
2757 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2758 arc_change_state(arc_mfu, buf, hash_lock);
2759 } else {
2760 ASSERT(!"invalid arc state");
2761 }
2762 }
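/*
 * Putting the state machine above together: a block read once lands in
 * the MRU state; a second demand read that arrives after ARC_MINTIME
 * has elapsed promotes it to the MFU, while a quicker re-read only
 * refreshes b_arc_access and leaves it in the MRU, so a rapid burst of
 * reads is treated roughly as a single access.
 */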
2763
2764 /* a generic arc_done_func_t which you can use */
2765 /* ARGSUSED */
2766 void
2767 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2768 {
2769 if (zio == NULL || zio->io_error == 0)
2770 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2771 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2772 }
2773
2774 /* a generic arc_done_func_t */
2775 void
2776 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2777 {
2778 arc_buf_t **bufp = arg;
2779 if (zio && zio->io_error) {
2780 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2781 *bufp = NULL;
2782 } else {
2783 *bufp = buf;
2784 ASSERT(buf->b_data);
2785 }
2786 }
2787
2788 static void
2789 arc_read_done(zio_t *zio)
2790 {
2791 arc_buf_hdr_t *hdr, *found;
2792 arc_buf_t *buf;
2793 arc_buf_t *abuf; /* buffer we're assigning to callback */
2794 kmutex_t *hash_lock;
2795 arc_callback_t *callback_list, *acb;
2796 int freeable = FALSE;
2797
2798 buf = zio->io_private;
2799 hdr = buf->b_hdr;
2800
2801 /*
2802 * The hdr was inserted into hash-table and removed from lists
2803 * prior to starting I/O. We should find this header, since
2804 * it's in the hash table, and it should be legit since it's
2805 * not possible to evict it during the I/O. The only possible
2806 * reason for it not to be found is if we were freed during the
2807 * read.
2808 */
2809 found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2810 &hash_lock);
2811
2812 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2813 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2814 (found == hdr && HDR_L2_READING(hdr)));
2815
2816 hdr->b_flags &= ~ARC_L2_EVICTED;
2817 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2818 hdr->b_flags &= ~ARC_L2CACHE;
2819
2820 /* byteswap if necessary */
2821 callback_list = hdr->b_acb;
2822 ASSERT(callback_list != NULL);
2823 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2824 dmu_object_byteswap_t bswap =
2825 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2826 if (BP_GET_LEVEL(zio->io_bp) > 0)
2827 byteswap_uint64_array(buf->b_data, hdr->b_size);
2828 else
2829 dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
2830 }
2831
2832 arc_cksum_compute(buf, B_FALSE);
2833
2834 if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2835 /*
2836 * Only call arc_access on anonymous buffers. This is because
2837 * if we've issued an I/O for an evicted buffer, we've already
2838 * called arc_access (to prevent any simultaneous readers from
2839 * getting confused).
2840 */
2841 arc_access(hdr, hash_lock);
2842 }
2843
2844 /* create copies of the data buffer for the callers */
2845 abuf = buf;
2846 for (acb = callback_list; acb; acb = acb->acb_next) {
2847 if (acb->acb_done) {
2848 if (abuf == NULL) {
2849 ARCSTAT_BUMP(arcstat_duplicate_reads);
2850 abuf = arc_buf_clone(buf);
2851 }
2852 acb->acb_buf = abuf;
2853 abuf = NULL;
2854 }
2855 }
2856 hdr->b_acb = NULL;
2857 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2858 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2859 if (abuf == buf) {
2860 ASSERT(buf->b_efunc == NULL);
2861 ASSERT(hdr->b_datacnt == 1);
2862 hdr->b_flags |= ARC_BUF_AVAILABLE;
2863 }
2864
2865 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2866
2867 if (zio->io_error != 0) {
2868 hdr->b_flags |= ARC_IO_ERROR;
2869 if (hdr->b_state != arc_anon)
2870 arc_change_state(arc_anon, hdr, hash_lock);
2871 if (HDR_IN_HASH_TABLE(hdr))
2872 buf_hash_remove(hdr);
2873 freeable = refcount_is_zero(&hdr->b_refcnt);
2874 }
2875
2876 /*
2877 * Broadcast before we drop the hash_lock to avoid the possibility
2878 * that the hdr (and hence the cv) might be freed before we get to
2879 * the cv_broadcast().
2880 */
2881 cv_broadcast(&hdr->b_cv);
2882
2883 if (hash_lock) {
2884 mutex_exit(hash_lock);
2885 } else {
2886 /*
2887 * This block was freed while we waited for the read to
2888 * complete. It has been removed from the hash table and
2889 * moved to the anonymous state (so that it won't show up
2890 * in the cache).
2891 */
2892 ASSERT3P(hdr->b_state, ==, arc_anon);
2893 freeable = refcount_is_zero(&hdr->b_refcnt);
2894 }
2895
2896 /* execute each callback and free its structure */
2897 while ((acb = callback_list) != NULL) {
2898 if (acb->acb_done)
2899 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2900
2901 if (acb->acb_zio_dummy != NULL) {
2902 acb->acb_zio_dummy->io_error = zio->io_error;
2903 zio_nowait(acb->acb_zio_dummy);
2904 }
2905
2906 callback_list = acb->acb_next;
2907 kmem_free(acb, sizeof (arc_callback_t));
2908 }
2909
2910 if (freeable)
2911 arc_hdr_destroy(hdr);
2912 }
2913
2914 /*
2915 * "Read" the block at the specified DVA (in bp) via the
2916 * cache. If the block is found in the cache, invoke the provided
2917 * callback immediately and return. Note that the `zio' parameter
2918 * in the callback will be NULL in this case, since no IO was
2919 * required. If the block is not in the cache, pass the read request
2920 * on to the spa with a substitute callback function, so that the
2921 * requested block will be added to the cache.
2922 *
2923 * If a read request arrives for a block that has a read in-progress,
2924 * either wait for the in-progress read to complete (and return the
2925 * results); or, if this is a read with a "done" func, add a record
2926 * to the read to invoke the "done" func when the read completes,
2927 * and return; or just return.
2928 *
2929 * arc_read_done() will invoke all the requested "done" functions
2930 * for readers of this block.
2931 */
2932 int
2933 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2934 void *private, int priority, int zio_flags, uint32_t *arc_flags,
2935 const zbookmark_t *zb)
2936 {
2937 arc_buf_hdr_t *hdr;
2938 arc_buf_t *buf = NULL;
2939 kmutex_t *hash_lock;
2940 zio_t *rzio;
2941 uint64_t guid = spa_load_guid(spa);
2942
2943 top:
2944 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2945 &hash_lock);
2946 if (hdr && hdr->b_datacnt > 0) {
2947
2948 *arc_flags |= ARC_CACHED;
2949
2950 if (HDR_IO_IN_PROGRESS(hdr)) {
2951
2952 if (*arc_flags & ARC_WAIT) {
2953 cv_wait(&hdr->b_cv, hash_lock);
2954 mutex_exit(hash_lock);
2955 goto top;
2956 }
2957 ASSERT(*arc_flags & ARC_NOWAIT);
2958
2959 if (done) {
2960 arc_callback_t *acb = NULL;
2961
2962 acb = kmem_zalloc(sizeof (arc_callback_t),
2963 KM_PUSHPAGE);
2964 acb->acb_done = done;
2965 acb->acb_private = private;
2966 if (pio != NULL)
2967 acb->acb_zio_dummy = zio_null(pio,
2968 spa, NULL, NULL, NULL, zio_flags);
2969
2970 ASSERT(acb->acb_done != NULL);
2971 acb->acb_next = hdr->b_acb;
2972 hdr->b_acb = acb;
2973 add_reference(hdr, hash_lock, private);
2974 mutex_exit(hash_lock);
2975 return (0);
2976 }
2977 mutex_exit(hash_lock);
2978 return (0);
2979 }
2980
2981 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2982
2983 if (done) {
2984 add_reference(hdr, hash_lock, private);
2985 /*
2986 * If this block is already in use, create a new
2987 * copy of the data so that we will be guaranteed
2988 * that arc_release() will always succeed.
2989 */
2990 buf = hdr->b_buf;
2991 ASSERT(buf);
2992 ASSERT(buf->b_data);
2993 if (HDR_BUF_AVAILABLE(hdr)) {
2994 ASSERT(buf->b_efunc == NULL);
2995 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2996 } else {
2997 buf = arc_buf_clone(buf);
2998 }
2999
3000 } else if (*arc_flags & ARC_PREFETCH &&
3001 refcount_count(&hdr->b_refcnt) == 0) {
3002 hdr->b_flags |= ARC_PREFETCH;
3003 }
3004 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3005 arc_access(hdr, hash_lock);
3006 if (*arc_flags & ARC_L2CACHE)
3007 hdr->b_flags |= ARC_L2CACHE;
3008 if (*arc_flags & ARC_L2COMPRESS)
3009 hdr->b_flags |= ARC_L2COMPRESS;
3010 mutex_exit(hash_lock);
3011 ARCSTAT_BUMP(arcstat_hits);
3012 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3013 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3014 data, metadata, hits);
3015
3016 if (done)
3017 done(NULL, buf, private);
3018 } else {
3019 uint64_t size = BP_GET_LSIZE(bp);
3020 arc_callback_t *acb;
3021 vdev_t *vd = NULL;
3022 uint64_t addr = -1;
3023 boolean_t devw = B_FALSE;
3024
3025 if (hdr == NULL) {
3026 /* this block is not in the cache */
3027 arc_buf_hdr_t *exists;
3028 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3029 buf = arc_buf_alloc(spa, size, private, type);
3030 hdr = buf->b_hdr;
3031 hdr->b_dva = *BP_IDENTITY(bp);
3032 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3033 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3034 exists = buf_hash_insert(hdr, &hash_lock);
3035 if (exists) {
3036 /* somebody beat us to the hash insert */
3037 mutex_exit(hash_lock);
3038 buf_discard_identity(hdr);
3039 (void) arc_buf_remove_ref(buf, private);
3040 goto top; /* restart the IO request */
3041 }
3042 /* if this is a prefetch, we don't have a reference */
3043 if (*arc_flags & ARC_PREFETCH) {
3044 (void) remove_reference(hdr, hash_lock,
3045 private);
3046 hdr->b_flags |= ARC_PREFETCH;
3047 }
3048 if (*arc_flags & ARC_L2CACHE)
3049 hdr->b_flags |= ARC_L2CACHE;
3050 if (*arc_flags & ARC_L2COMPRESS)
3051 hdr->b_flags |= ARC_L2COMPRESS;
3052 if (BP_GET_LEVEL(bp) > 0)
3053 hdr->b_flags |= ARC_INDIRECT;
3054 } else {
3055 /* this block is in the ghost cache */
3056 ASSERT(GHOST_STATE(hdr->b_state));
3057 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3058 ASSERT0(refcount_count(&hdr->b_refcnt));
3059 ASSERT(hdr->b_buf == NULL);
3060
3061 /* if this is a prefetch, we don't have a reference */
3062 if (*arc_flags & ARC_PREFETCH)
3063 hdr->b_flags |= ARC_PREFETCH;
3064 else
3065 add_reference(hdr, hash_lock, private);
3066 if (*arc_flags & ARC_L2CACHE)
3067 hdr->b_flags |= ARC_L2CACHE;
3068 if (*arc_flags & ARC_L2COMPRESS)
3069 hdr->b_flags |= ARC_L2COMPRESS;
3070 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3071 buf->b_hdr = hdr;
3072 buf->b_data = NULL;
3073 buf->b_efunc = NULL;
3074 buf->b_private = NULL;
3075 buf->b_next = NULL;
3076 hdr->b_buf = buf;
3077 ASSERT(hdr->b_datacnt == 0);
3078 hdr->b_datacnt = 1;
3079 arc_get_data_buf(buf);
3080 arc_access(hdr, hash_lock);
3081 }
3082
3083 ASSERT(!GHOST_STATE(hdr->b_state));
3084
3085 acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
3086 acb->acb_done = done;
3087 acb->acb_private = private;
3088
3089 ASSERT(hdr->b_acb == NULL);
3090 hdr->b_acb = acb;
3091 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3092
3093 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3094 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3095 devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3096 addr = hdr->b_l2hdr->b_daddr;
3097 /*
3098 * Lock out device removal.
3099 */
3100 if (vdev_is_dead(vd) ||
3101 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3102 vd = NULL;
3103 }
3104
3105 mutex_exit(hash_lock);
3106
3107 ASSERT3U(hdr->b_size, ==, size);
3108 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3109 uint64_t, size, zbookmark_t *, zb);
3110 ARCSTAT_BUMP(arcstat_misses);
3111 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3112 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3113 data, metadata, misses);
3114
3115 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3116 /*
3117 * Read from the L2ARC if the following are true:
3118 * 1. The L2ARC vdev was previously cached.
3119 * 2. This buffer still has L2ARC metadata.
3120 * 3. This buffer isn't currently writing to the L2ARC.
3121 * 4. The L2ARC entry wasn't evicted, which may
3122 * also have invalidated the vdev.
3123 * 5. This isn't a prefetch with l2arc_noprefetch enabled.
3124 */
3125 if (hdr->b_l2hdr != NULL &&
3126 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3127 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3128 l2arc_read_callback_t *cb;
3129
3130 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3131 ARCSTAT_BUMP(arcstat_l2_hits);
3132
3133 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3134 KM_PUSHPAGE);
3135 cb->l2rcb_buf = buf;
3136 cb->l2rcb_spa = spa;
3137 cb->l2rcb_bp = *bp;
3138 cb->l2rcb_zb = *zb;
3139 cb->l2rcb_flags = zio_flags;
3140 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3141
3142 /*
3143 * l2arc read. The SCL_L2ARC lock will be
3144 * released by l2arc_read_done().
3145 * Issue a null zio if the underlying buffer
3146 * was squashed to zero size by compression.
3147 */
3148 if (hdr->b_l2hdr->b_compress ==
3149 ZIO_COMPRESS_EMPTY) {
3150 rzio = zio_null(pio, spa, vd,
3151 l2arc_read_done, cb,
3152 zio_flags | ZIO_FLAG_DONT_CACHE |
3153 ZIO_FLAG_CANFAIL |
3154 ZIO_FLAG_DONT_PROPAGATE |
3155 ZIO_FLAG_DONT_RETRY);
3156 } else {
3157 rzio = zio_read_phys(pio, vd, addr,
3158 hdr->b_l2hdr->b_asize,
3159 buf->b_data, ZIO_CHECKSUM_OFF,
3160 l2arc_read_done, cb, priority,
3161 zio_flags | ZIO_FLAG_DONT_CACHE |
3162 ZIO_FLAG_CANFAIL |
3163 ZIO_FLAG_DONT_PROPAGATE |
3164 ZIO_FLAG_DONT_RETRY, B_FALSE);
3165 }
3166 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3167 zio_t *, rzio);
3168 ARCSTAT_INCR(arcstat_l2_read_bytes,
3169 hdr->b_l2hdr->b_asize);
3170
3171 if (*arc_flags & ARC_NOWAIT) {
3172 zio_nowait(rzio);
3173 return (0);
3174 }
3175
3176 ASSERT(*arc_flags & ARC_WAIT);
3177 if (zio_wait(rzio) == 0)
3178 return (0);
3179
3180 /* l2arc read error; goto zio_read() */
3181 } else {
3182 DTRACE_PROBE1(l2arc__miss,
3183 arc_buf_hdr_t *, hdr);
3184 ARCSTAT_BUMP(arcstat_l2_misses);
3185 if (HDR_L2_WRITING(hdr))
3186 ARCSTAT_BUMP(arcstat_l2_rw_clash);
3187 spa_config_exit(spa, SCL_L2ARC, vd);
3188 }
3189 } else {
3190 if (vd != NULL)
3191 spa_config_exit(spa, SCL_L2ARC, vd);
3192 if (l2arc_ndev != 0) {
3193 DTRACE_PROBE1(l2arc__miss,
3194 arc_buf_hdr_t *, hdr);
3195 ARCSTAT_BUMP(arcstat_l2_misses);
3196 }
3197 }
3198
3199 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3200 arc_read_done, buf, priority, zio_flags, zb);
3201
3202 if (*arc_flags & ARC_WAIT)
3203 return (zio_wait(rzio));
3204
3205 ASSERT(*arc_flags & ARC_NOWAIT);
3206 zio_nowait(rzio);
3207 }
3208 return (0);
3209 }
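/*
 * A minimal synchronous-read sketch using the interface above (the
 * caller-side variables are illustrative only):
 *
 *	uint32_t aflags = ARC_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err;
 *
 *	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (err == 0 && abuf != NULL) {
 *		... consume BP_GET_LSIZE(bp) bytes at abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 *
 * On error, arc_getbuf_func() has already dropped the reference and
 * left abuf set to NULL.
 */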
3210
3211 arc_prune_t *
3212 arc_add_prune_callback(arc_prune_func_t *func, void *private)
3213 {
3214 arc_prune_t *p;
3215
3216 p = kmem_alloc(sizeof (*p), KM_SLEEP);
3217 p->p_pfunc = func;
3218 p->p_private = private;
3219 list_link_init(&p->p_node);
3220 refcount_create(&p->p_refcnt);
3221
3222 mutex_enter(&arc_prune_mtx);
3223 refcount_add(&p->p_refcnt, &arc_prune_list);
3224 list_insert_head(&arc_prune_list, p);
3225 mutex_exit(&arc_prune_mtx);
3226
3227 return (p);
3228 }
3229
3230 void
3231 arc_remove_prune_callback(arc_prune_t *p)
3232 {
3233 mutex_enter(&arc_prune_mtx);
3234 list_remove(&arc_prune_list, p);
3235 if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
3236 refcount_destroy(&p->p_refcnt);
3237 kmem_free(p, sizeof (*p));
3238 }
3239 mutex_exit(&arc_prune_mtx);
3240 }
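/*
 * A sketch of a (hypothetical) consumer of the prune interface above;
 * the callback runs in the context of the arc_adapt thread, as noted in
 * arc_get_data_buf(), and is asked to drop roughly `nr' bytes worth of
 * pinned metadata references:
 *
 *	static void
 *	my_prune_cb(int64_t nr, void *arg)
 *	{
 *		... release up to nr bytes worth of cached references ...
 *	}
 *
 *	arc_prune_t *ap = arc_add_prune_callback(my_prune_cb, my_state);
 *	...
 *	arc_remove_prune_callback(ap);
 */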
3241
3242 void
3243 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3244 {
3245 ASSERT(buf->b_hdr != NULL);
3246 ASSERT(buf->b_hdr->b_state != arc_anon);
3247 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3248 ASSERT(buf->b_efunc == NULL);
3249 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3250
3251 buf->b_efunc = func;
3252 buf->b_private = private;
3253 }
3254
3255 /*
3256 * Notify the arc that a block was freed, and thus will never be used again.
3257 */
3258 void
3259 arc_freed(spa_t *spa, const blkptr_t *bp)
3260 {
3261 arc_buf_hdr_t *hdr;
3262 kmutex_t *hash_lock;
3263 uint64_t guid = spa_load_guid(spa);
3264
3265 hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3266 &hash_lock);
3267 if (hdr == NULL)
3268 return;
3269 if (HDR_BUF_AVAILABLE(hdr)) {
3270 arc_buf_t *buf = hdr->b_buf;
3271 add_reference(hdr, hash_lock, FTAG);
3272 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3273 mutex_exit(hash_lock);
3274
3275 arc_release(buf, FTAG);
3276 (void) arc_buf_remove_ref(buf, FTAG);
3277 } else {
3278 mutex_exit(hash_lock);
3279 }
3280
3281 }
3282
3283 /*
3284 * This is used by the DMU to let the ARC know that a buffer is
3285 * being evicted, so the ARC should clean up. If this arc buf
3286 * is not yet in the evicted state, it will be put there.
3287 */
3288 int
3289 arc_buf_evict(arc_buf_t *buf)
3290 {
3291 arc_buf_hdr_t *hdr;
3292 kmutex_t *hash_lock;
3293 arc_buf_t **bufp;
3294
3295 mutex_enter(&buf->b_evict_lock);
3296 hdr = buf->b_hdr;
3297 if (hdr == NULL) {
3298 /*
3299 * We are in arc_do_user_evicts().
3300 */
3301 ASSERT(buf->b_data == NULL);
3302 mutex_exit(&buf->b_evict_lock);
3303 return (0);
3304 } else if (buf->b_data == NULL) {
3305 arc_buf_t copy = *buf; /* structure assignment */
3306 /*
3307 * We are on the eviction list; process this buffer now
3308 * but let arc_do_user_evicts() do the reaping.
3309 */
3310 buf->b_efunc = NULL;
3311 mutex_exit(&buf->b_evict_lock);
3312 VERIFY(copy.b_efunc(&copy) == 0);
3313 return (1);
3314 }
3315 hash_lock = HDR_LOCK(hdr);
3316 mutex_enter(hash_lock);
3317 hdr = buf->b_hdr;
3318 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3319
3320 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3321 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3322
3323 /*
3324 * Pull this buffer off of the hdr
3325 */
3326 bufp = &hdr->b_buf;
3327 while (*bufp != buf)
3328 bufp = &(*bufp)->b_next;
3329 *bufp = buf->b_next;
3330
3331 ASSERT(buf->b_data != NULL);
3332 arc_buf_destroy(buf, FALSE, FALSE);
3333
3334 if (hdr->b_datacnt == 0) {
3335 arc_state_t *old_state = hdr->b_state;
3336 arc_state_t *evicted_state;
3337
3338 ASSERT(hdr->b_buf == NULL);
3339 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3340
3341 evicted_state =
3342 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3343
3344 mutex_enter(&old_state->arcs_mtx);
3345 mutex_enter(&evicted_state->arcs_mtx);
3346
3347 arc_change_state(evicted_state, hdr, hash_lock);
3348 ASSERT(HDR_IN_HASH_TABLE(hdr));
3349 hdr->b_flags |= ARC_IN_HASH_TABLE;
3350 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3351
3352 mutex_exit(&evicted_state->arcs_mtx);
3353 mutex_exit(&old_state->arcs_mtx);
3354 }
3355 mutex_exit(hash_lock);
3356 mutex_exit(&buf->b_evict_lock);
3357
3358 VERIFY(buf->b_efunc(buf) == 0);
3359 buf->b_efunc = NULL;
3360 buf->b_private = NULL;
3361 buf->b_hdr = NULL;
3362 buf->b_next = NULL;
3363 kmem_cache_free(buf_cache, buf);
3364 return (1);
3365 }
3366
3367 /*
3368 * Release this buffer from the cache. This must be done
3369 * after a read and prior to modifying the buffer contents.
3370 * If the buffer has more than one reference, we must make
3371 * a new hdr for the buffer.
3372 */
3373 void
3374 arc_release(arc_buf_t *buf, void *tag)
3375 {
3376 arc_buf_hdr_t *hdr;
3377 kmutex_t *hash_lock = NULL;
3378 l2arc_buf_hdr_t *l2hdr;
3379 uint64_t buf_size = 0;
3380
3381 /*
3382 * It would be nice to assert that if it's DMU metadata (level >
3383 * 0 || it's the dnode file), then it must be syncing context.
3384 * But we don't know that information at this level.
3385 */
3386
3387 mutex_enter(&buf->b_evict_lock);
3388 hdr = buf->b_hdr;
3389
3390 /* this buffer is not on any list */
3391 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3392
3393 if (hdr->b_state == arc_anon) {
3394 /* this buffer is already released */
3395 ASSERT(buf->b_efunc == NULL);
3396 } else {
3397 hash_lock = HDR_LOCK(hdr);
3398 mutex_enter(hash_lock);
3399 hdr = buf->b_hdr;
3400 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3401 }
3402
3403 l2hdr = hdr->b_l2hdr;
3404 if (l2hdr) {
3405 mutex_enter(&l2arc_buflist_mtx);
3406 hdr->b_l2hdr = NULL;
3407 buf_size = hdr->b_size;
3408 }
3409
3410 /*
3411 * Do we have more than one buf?
3412 */
3413 if (hdr->b_datacnt > 1) {
3414 arc_buf_hdr_t *nhdr;
3415 arc_buf_t **bufp;
3416 uint64_t blksz = hdr->b_size;
3417 uint64_t spa = hdr->b_spa;
3418 arc_buf_contents_t type = hdr->b_type;
3419 uint32_t flags = hdr->b_flags;
3420
3421 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3422 /*
3423 * Pull the data off of this hdr and attach it to
3424 * a new anonymous hdr.
3425 */
3426 (void) remove_reference(hdr, hash_lock, tag);
3427 bufp = &hdr->b_buf;
3428 while (*bufp != buf)
3429 bufp = &(*bufp)->b_next;
3430 *bufp = buf->b_next;
3431 buf->b_next = NULL;
3432
3433 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3434 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3435 if (refcount_is_zero(&hdr->b_refcnt)) {
3436 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3437 ASSERT3U(*size, >=, hdr->b_size);
3438 atomic_add_64(size, -hdr->b_size);
3439 }
3440
3441 /*
3442 * We're releasing a duplicate user data buffer, update
3443 * our statistics accordingly.
3444 */
3445 if (hdr->b_type == ARC_BUFC_DATA) {
3446 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3447 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3448 -hdr->b_size);
3449 }
3450 hdr->b_datacnt -= 1;
3451 arc_cksum_verify(buf);
3452
3453 mutex_exit(hash_lock);
3454
3455 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3456 nhdr->b_size = blksz;
3457 nhdr->b_spa = spa;
3458 nhdr->b_type = type;
3459 nhdr->b_buf = buf;
3460 nhdr->b_state = arc_anon;
3461 nhdr->b_arc_access = 0;
3462 nhdr->b_flags = flags & ARC_L2_WRITING;
3463 nhdr->b_l2hdr = NULL;
3464 nhdr->b_datacnt = 1;
3465 nhdr->b_freeze_cksum = NULL;
3466 (void) refcount_add(&nhdr->b_refcnt, tag);
3467 buf->b_hdr = nhdr;
3468 mutex_exit(&buf->b_evict_lock);
3469 atomic_add_64(&arc_anon->arcs_size, blksz);
3470 } else {
3471 mutex_exit(&buf->b_evict_lock);
3472 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3473 ASSERT(!list_link_active(&hdr->b_arc_node));
3474 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3475 if (hdr->b_state != arc_anon)
3476 arc_change_state(arc_anon, hdr, hash_lock);
3477 hdr->b_arc_access = 0;
3478 if (hash_lock)
3479 mutex_exit(hash_lock);
3480
3481 buf_discard_identity(hdr);
3482 arc_buf_thaw(buf);
3483 }
3484 buf->b_efunc = NULL;
3485 buf->b_private = NULL;
3486
3487 if (l2hdr) {
3488 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3489 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3490 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3491 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
3492 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3493 mutex_exit(&l2arc_buflist_mtx);
3494 }
3495 }
3496
3497 int
3498 arc_released(arc_buf_t *buf)
3499 {
3500 int released;
3501
3502 mutex_enter(&buf->b_evict_lock);
3503 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3504 mutex_exit(&buf->b_evict_lock);
3505 return (released);
3506 }
3507
3508 int
3509 arc_has_callback(arc_buf_t *buf)
3510 {
3511 int callback;
3512
3513 mutex_enter(&buf->b_evict_lock);
3514 callback = (buf->b_efunc != NULL);
3515 mutex_exit(&buf->b_evict_lock);
3516 return (callback);
3517 }
3518
3519 #ifdef ZFS_DEBUG
3520 int
3521 arc_referenced(arc_buf_t *buf)
3522 {
3523 int referenced;
3524
3525 mutex_enter(&buf->b_evict_lock);
3526 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3527 mutex_exit(&buf->b_evict_lock);
3528 return (referenced);
3529 }
3530 #endif
3531
3532 static void
3533 arc_write_ready(zio_t *zio)
3534 {
3535 arc_write_callback_t *callback = zio->io_private;
3536 arc_buf_t *buf = callback->awcb_buf;
3537 arc_buf_hdr_t *hdr = buf->b_hdr;
3538
3539 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3540 callback->awcb_ready(zio, buf, callback->awcb_private);
3541
3542 /*
3543 * If the IO is already in progress, then this is a re-write
3544 * attempt, so we need to thaw and re-compute the cksum.
3545 * It is the responsibility of the callback to handle the
3546 * accounting for any re-write attempt.
3547 */
3548 if (HDR_IO_IN_PROGRESS(hdr)) {
3549 mutex_enter(&hdr->b_freeze_lock);
3550 if (hdr->b_freeze_cksum != NULL) {
3551 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3552 hdr->b_freeze_cksum = NULL;
3553 }
3554 mutex_exit(&hdr->b_freeze_lock);
3555 }
3556 arc_cksum_compute(buf, B_FALSE);
3557 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3558 }
3559
3560 static void
3561 arc_write_done(zio_t *zio)
3562 {
3563 arc_write_callback_t *callback = zio->io_private;
3564 arc_buf_t *buf = callback->awcb_buf;
3565 arc_buf_hdr_t *hdr = buf->b_hdr;
3566
3567 ASSERT(hdr->b_acb == NULL);
3568
3569 if (zio->io_error == 0) {
3570 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3571 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3572 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3573 } else {
3574 ASSERT(BUF_EMPTY(hdr));
3575 }
3576
3577 /*
3578 * If the block to be written was all-zero, we may have
3579 * compressed it away. In this case no write was performed
3580 * so there will be no dva/birth/checksum. The buffer must
3581 * therefore remain anonymous (and uncached).
3582 */
3583 if (!BUF_EMPTY(hdr)) {
3584 arc_buf_hdr_t *exists;
3585 kmutex_t *hash_lock;
3586
3587 ASSERT(zio->io_error == 0);
3588
3589 arc_cksum_verify(buf);
3590
3591 exists = buf_hash_insert(hdr, &hash_lock);
3592 if (exists) {
3593 /*
3594 * This can only happen if we overwrite for
3595 * sync-to-convergence, because we remove
3596 * buffers from the hash table when we arc_free().
3597 */
3598 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3599 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3600 panic("bad overwrite, hdr=%p exists=%p",
3601 (void *)hdr, (void *)exists);
3602 ASSERT(refcount_is_zero(&exists->b_refcnt));
3603 arc_change_state(arc_anon, exists, hash_lock);
3604 mutex_exit(hash_lock);
3605 arc_hdr_destroy(exists);
3606 exists = buf_hash_insert(hdr, &hash_lock);
3607 ASSERT3P(exists, ==, NULL);
3608 } else {
3609 /* Dedup */
3610 ASSERT(hdr->b_datacnt == 1);
3611 ASSERT(hdr->b_state == arc_anon);
3612 ASSERT(BP_GET_DEDUP(zio->io_bp));
3613 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3614 }
3615 }
3616 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3617 /* if it's not anon, we are doing a scrub */
3618 if (!exists && hdr->b_state == arc_anon)
3619 arc_access(hdr, hash_lock);
3620 mutex_exit(hash_lock);
3621 } else {
3622 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3623 }
3624
3625 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3626 callback->awcb_done(zio, buf, callback->awcb_private);
3627
3628 kmem_free(callback, sizeof (arc_write_callback_t));
3629 }
3630
3631 zio_t *
3632 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3633 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3634 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3635 void *private, int priority, int zio_flags, const zbookmark_t *zb)
3636 {
3637 arc_buf_hdr_t *hdr = buf->b_hdr;
3638 arc_write_callback_t *callback;
3639 zio_t *zio;
3640
3641 ASSERT(ready != NULL);
3642 ASSERT(done != NULL);
3643 ASSERT(!HDR_IO_ERROR(hdr));
3644 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3645 ASSERT(hdr->b_acb == NULL);
3646 if (l2arc)
3647 hdr->b_flags |= ARC_L2CACHE;
3648 if (l2arc_compress)
3649 hdr->b_flags |= ARC_L2COMPRESS;
3650 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
3651 callback->awcb_ready = ready;
3652 callback->awcb_done = done;
3653 callback->awcb_private = private;
3654 callback->awcb_buf = buf;
3655
3656 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3657 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3658
3659 return (zio);
3660 }
3661
3662 static int
3663 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3664 {
3665 #ifdef _KERNEL
3666 uint64_t available_memory;
3667
3668 if (zfs_arc_memory_throttle_disable)
3669 return (0);
3670
3671 /* Easily reclaimable memory (free + inactive + arc-evictable) */
3672 available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
3673
3674 if (available_memory <= zfs_write_limit_max) {
3675 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3676 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
3677 return (EAGAIN);
3678 }
3679
3680 if (inflight_data > available_memory / 4) {
3681 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3682 DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
3683 return (ERESTART);
3684 }
3685 #endif
3686 return (0);
3687 }
3688
3689 void
3690 arc_tempreserve_clear(uint64_t reserve)
3691 {
3692 atomic_add_64(&arc_tempreserve, -reserve);
3693 ASSERT((int64_t)arc_tempreserve >= 0);
3694 }
3695
3696 int
3697 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3698 {
3699 int error;
3700 uint64_t anon_size;
3701
3702 #ifdef ZFS_DEBUG
3703 /*
3704 * Once in a while, fail for no reason. Everything should cope.
3705 */
3706 if (spa_get_random(10000) == 0) {
3707 dprintf("forcing random failure\n");
3708 return (ERESTART);
3709 }
3710 #endif
3711 if (reserve > arc_c/4 && !arc_no_grow)
3712 arc_c = MIN(arc_c_max, reserve * 4);
3713 if (reserve > arc_c) {
3714 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
3715 return (ENOMEM);
3716 }
3717
3718 /*
3719 * Don't count loaned bufs as in flight dirty data to prevent long
3720 * network delays from blocking transactions that are ready to be
3721 * assigned to a txg.
3722 */
3723 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3724
3725 /*
3726 * Writes will almost always require additional memory allocations
3727 * in order to compress/encrypt/etc. the data. We therefore need to
3728 * make sure that there is sufficient available memory for this.
3729 */
3730 if ((error = arc_memory_throttle(reserve, anon_size, txg)))
3731 return (error);
3732
3733 /*
3734 * Throttle writes when the amount of dirty data in the cache
3735 * gets too large. We try to keep the cache less than half full
3736 * of dirty blocks so that our sync times don't grow too large.
3737 * Note: if two requests come in concurrently, we might let them
3738 * both succeed, when one of them should fail. Not a huge deal.
3739 */
3740
3741 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3742 anon_size > arc_c / 4) {
3743 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3744 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3745 arc_tempreserve>>10,
3746 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3747 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3748 reserve>>10, arc_c>>10);
3749 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
3750 return (ERESTART);
3751 }
3752 atomic_add_64(&arc_tempreserve, reserve);
3753 return (0);
3754 }
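
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the dirty-data throttle decision made in arc_tempreserve_space() above,
 * restated as a standalone predicate.  The function name and its caller
 * are hypothetical; only the arithmetic mirrors the check performed on
 * reserve, arc_tempreserve, anon_size and arc_c.
 */
static int
example_would_throttle_write(uint64_t reserve, uint64_t tempreserve,
    uint64_t anon_size, uint64_t cache_target)
{
	/*
	 * Throttle when the requested plus in-flight dirty data would
	 * exceed half of the cache target *and* anonymous (not yet
	 * synced) data alone exceeds a quarter of it.
	 */
	return (reserve + tempreserve + anon_size > cache_target / 2 &&
	    anon_size > cache_target / 4);
}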
3755
3756 static void
3757 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
3758 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
3759 {
3760 size->value.ui64 = state->arcs_size;
3761 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
3762 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
3763 }
3764
3765 static int
3766 arc_kstat_update(kstat_t *ksp, int rw)
3767 {
3768 arc_stats_t *as = ksp->ks_data;
3769
3770 if (rw == KSTAT_WRITE) {
3771 return (EACCES);
3772 } else {
3773 arc_kstat_update_state(arc_anon,
3774 &as->arcstat_anon_size,
3775 &as->arcstat_anon_evict_data,
3776 &as->arcstat_anon_evict_metadata);
3777 arc_kstat_update_state(arc_mru,
3778 &as->arcstat_mru_size,
3779 &as->arcstat_mru_evict_data,
3780 &as->arcstat_mru_evict_metadata);
3781 arc_kstat_update_state(arc_mru_ghost,
3782 &as->arcstat_mru_ghost_size,
3783 &as->arcstat_mru_ghost_evict_data,
3784 &as->arcstat_mru_ghost_evict_metadata);
3785 arc_kstat_update_state(arc_mfu,
3786 &as->arcstat_mfu_size,
3787 &as->arcstat_mfu_evict_data,
3788 &as->arcstat_mfu_evict_metadata);
3789 arc_kstat_update_state(arc_mfu_ghost,
3790 &as->arcstat_mfu_ghost_size,
3791 &as->arcstat_mfu_ghost_evict_data,
3792 &as->arcstat_mfu_ghost_evict_metadata);
3793 }
3794
3795 return (0);
3796 }
3797
3798 void
3799 arc_init(void)
3800 {
3801 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3802 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3803
3804 /* Convert seconds to clock ticks */
3805 zfs_arc_min_prefetch_lifespan = 1 * hz;
3806
3807 /* Start out with 1/8 of all memory */
3808 arc_c = physmem * PAGESIZE / 8;
3809
3810 #ifdef _KERNEL
3811 /*
3812 * On architectures where the physical memory can be larger
3813 * than the addressable space (Intel in 32-bit mode), we may
3814 * need to limit the cache to 1/8 of VM size.
3815 */
3816 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3817 /*
3818 * Register a shrinker to support synchronous (direct) memory
3819 * reclaim from the arc. This is done to prevent kswapd from
3820 * swapping out pages when it is preferable to shrink the arc.
3821 */
3822 spl_register_shrinker(&arc_shrinker);
3823 #endif
3824
3825 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3826 arc_c_min = MAX(arc_c / 4, 64<<20);
3827 /* set max to 1/2 of all memory */
3828 arc_c_max = MAX(arc_c * 4, arc_c_max);
3829
3830 /*
3831 * Allow the tunables to override our calculations if they are
3832 * reasonable (i.e. over 64MB)
3833 */
3834 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3835 arc_c_max = zfs_arc_max;
3836 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3837 arc_c_min = zfs_arc_min;
3838
3839 arc_c = arc_c_max;
3840 arc_p = (arc_c >> 1);
3841
3842 /* limit meta-data to 1/4 of the arc capacity */
3843 arc_meta_limit = arc_c_max / 4;
3844 arc_meta_max = 0;
3845
3846 /* Allow the tunable to override if it is reasonable */
3847 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3848 arc_meta_limit = zfs_arc_meta_limit;
3849
3850 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3851 arc_c_min = arc_meta_limit / 2;
3852
3853 /* if kmem_flags are set, let's try to use less memory */
3854 if (kmem_debugging())
3855 arc_c = arc_c / 2;
3856 if (arc_c < arc_c_min)
3857 arc_c = arc_c_min;
3858
3859 arc_anon = &ARC_anon;
3860 arc_mru = &ARC_mru;
3861 arc_mru_ghost = &ARC_mru_ghost;
3862 arc_mfu = &ARC_mfu;
3863 arc_mfu_ghost = &ARC_mfu_ghost;
3864 arc_l2c_only = &ARC_l2c_only;
3865 arc_size = 0;
3866
3867 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3868 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3869 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3870 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3871 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3872 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3873
3874 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3875 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3876 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3877 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3878 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3879 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3880 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3881 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3882 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3883 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3884 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3885 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3886 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3887 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3888 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3889 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3890 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3891 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3892 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3893 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3894
3895 buf_init();
3896
3897 arc_thread_exit = 0;
3898 list_create(&arc_prune_list, sizeof (arc_prune_t),
3899 offsetof(arc_prune_t, p_node));
3900 arc_eviction_list = NULL;
3901 mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
3902 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3903 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3904
3905 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3906 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3907
3908 if (arc_ksp != NULL) {
3909 arc_ksp->ks_data = &arc_stats;
3910 arc_ksp->ks_update = arc_kstat_update;
3911 kstat_install(arc_ksp);
3912 }
3913
3914 (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
3915 TS_RUN, minclsyspri);
3916
3917 arc_dead = FALSE;
3918 arc_warm = B_FALSE;
3919
3920 if (zfs_write_limit_max == 0)
3921 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3922 else
3923 zfs_write_limit_shift = 0;
3924 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3925 }
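
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the default ARC sizing arithmetic from arc_init() applied to a
 * hypothetical machine, ignoring the 32-bit VM clamp, module tunables and
 * kmem-debugging adjustments.  With 8 GiB of physical memory this yields
 * arc_c = 1 GiB, arc_c_min = 256 MiB and arc_c_max = 4 GiB.
 */
static void
example_default_arc_sizing(uint64_t physmem_bytes, uint64_t *c,
    uint64_t *c_min, uint64_t *c_max)
{
	*c = physmem_bytes / 8;		/* start at 1/8 of memory */
	*c_min = *c / 4;		/* 1/32 of memory ... */
	if (*c_min < (64ULL << 20))
		*c_min = 64ULL << 20;	/* ... or 64 MiB, whichever is more */
	*c_max = *c * 4;		/* 1/2 of memory */
}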
3926
3927 void
3928 arc_fini(void)
3929 {
3930 arc_prune_t *p;
3931
3932 mutex_enter(&arc_reclaim_thr_lock);
3933 #ifdef _KERNEL
3934 spl_unregister_shrinker(&arc_shrinker);
3935 #endif /* _KERNEL */
3936
3937 arc_thread_exit = 1;
3938 while (arc_thread_exit != 0)
3939 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3940 mutex_exit(&arc_reclaim_thr_lock);
3941
3942 arc_flush(NULL);
3943
3944 arc_dead = TRUE;
3945
3946 if (arc_ksp != NULL) {
3947 kstat_delete(arc_ksp);
3948 arc_ksp = NULL;
3949 }
3950
3951 mutex_enter(&arc_prune_mtx);
3952 while ((p = list_head(&arc_prune_list)) != NULL) {
3953 list_remove(&arc_prune_list, p);
3954 refcount_remove(&p->p_refcnt, &arc_prune_list);
3955 refcount_destroy(&p->p_refcnt);
3956 kmem_free(p, sizeof (*p));
3957 }
3958 mutex_exit(&arc_prune_mtx);
3959
3960 list_destroy(&arc_prune_list);
3961 mutex_destroy(&arc_prune_mtx);
3962 mutex_destroy(&arc_eviction_mtx);
3963 mutex_destroy(&arc_reclaim_thr_lock);
3964 cv_destroy(&arc_reclaim_thr_cv);
3965
3966 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3967 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3968 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3969 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3970 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3971 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3972 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3973 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3974
3975 mutex_destroy(&arc_anon->arcs_mtx);
3976 mutex_destroy(&arc_mru->arcs_mtx);
3977 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3978 mutex_destroy(&arc_mfu->arcs_mtx);
3979 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3980 mutex_destroy(&arc_l2c_only->arcs_mtx);
3981
3982 mutex_destroy(&zfs_write_limit_lock);
3983
3984 buf_fini();
3985
3986 ASSERT(arc_loaned_bytes == 0);
3987 }
3988
3989 /*
3990 * Level 2 ARC
3991 *
3992 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3993 * It uses dedicated storage devices to hold cached data, which are populated
3994 * using large infrequent writes. The main role of this cache is to boost
3995 * the performance of random read workloads. The intended L2ARC devices
3996 * include short-stroked disks, solid state disks, and other media with
3997 * substantially faster read latency than disk.
3998 *
3999 * +-----------------------+
4000 * | ARC |
4001 * +-----------------------+
4002 * | ^ ^
4003 * | | |
4004 * l2arc_feed_thread() arc_read()
4005 * | | |
4006 * | l2arc read |
4007 * V | |
4008 * +---------------+ |
4009 * | L2ARC | |
4010 * +---------------+ |
4011 * | ^ |
4012 * l2arc_write() | |
4013 * | | |
4014 * V | |
4015 * +-------+ +-------+
4016 * | vdev | | vdev |
4017 * | cache | | cache |
4018 * +-------+ +-------+
4019 * +=========+ .-----.
4020 * : L2ARC : |-_____-|
4021 * : devices : | Disks |
4022 * +=========+ `-_____-'
4023 *
4024 * Read requests are satisfied from the following sources, in order:
4025 *
4026 * 1) ARC
4027 * 2) vdev cache of L2ARC devices
4028 * 3) L2ARC devices
4029 * 4) vdev cache of disks
4030 * 5) disks
4031 *
4032 * Some L2ARC device types exhibit extremely slow write performance.
4033 * To accommodate this, there are some significant differences between
4034 * the L2ARC and a traditional cache design:
4035 *
4036 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
4037 * the ARC behave as usual, freeing buffers and placing headers on ghost
4038 * lists. The ARC does not send buffers to the L2ARC during eviction as
4039 * this would add inflated write latencies for all ARC memory pressure.
4040 *
4041 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4042 * It does this by periodically scanning buffers from the eviction-end of
4043 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4044 * not already there. It scans until a headroom of buffers is satisfied,
4045 * which itself is a buffer for ARC eviction. If a compressible buffer is
4046 * found during scanning and selected for writing to an L2ARC device, we
4047 * temporarily boost scanning headroom during the next scan cycle to make
4048 * sure we adapt to compression effects (which might significantly reduce
4049 * the data volume we write to L2ARC). The thread that does this is
4050 * l2arc_feed_thread(), illustrated below; example sizes are included to
4051 * provide a better sense of ratio than this diagram:
4052 *
4053 * head --> tail
4054 * +---------------------+----------+
4055 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
4056 * +---------------------+----------+ | o L2ARC eligible
4057 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
4058 * +---------------------+----------+ |
4059 * 15.9 Gbytes ^ 32 Mbytes |
4060 * headroom |
4061 * l2arc_feed_thread()
4062 * |
4063 * l2arc write hand <--[oooo]--'
4064 * | 8 Mbyte
4065 * | write max
4066 * V
4067 * +==============================+
4068 * L2ARC dev |####|#|###|###| |####| ... |
4069 * +==============================+
4070 * 32 Gbytes
4071 *
4072 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4073 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4074 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
4075 * safe to say that this is an uncommon case, since buffers at the end of
4076 * the ARC lists have moved there due to inactivity.
4077 *
4078 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4079 * then the L2ARC simply misses copying some buffers. This serves as a
4080 * pressure valve to prevent heavy read workloads from both stalling the ARC
4081 * with waits and clogging the L2ARC with writes. This also helps prevent
4082 * the potential for the L2ARC to churn if it attempts to cache content too
4083 * quickly, such as during backups of the entire pool.
4084 *
4085 * 5. After system boot and before the ARC has filled main memory, there are
4086 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4087 * lists can remain mostly static. Instead of searching from the tail of these
4088 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4089 * for eligible buffers, greatly increasing its chance of finding them.
4090 *
4091 * The L2ARC device write speed is also boosted during this time so that
4092 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
4093 * there are no L2ARC reads, and no fear of degrading read performance
4094 * through increased writes.
4095 *
4096 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4097 * the vdev queue can aggregate them into larger and fewer writes. Each
4098 * device is written to in a rotor fashion, sweeping writes through
4099 * available space then repeating.
4100 *
4101 * 7. The L2ARC does not store dirty content. It never needs to flush
4102 * write buffers back to disk based storage.
4103 *
4104 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4105 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4106 *
4107 * The performance of the L2ARC can be tweaked by a number of tunables, which
4108 * may be necessary for different workloads:
4109 *
4110 * l2arc_write_max max write bytes per interval
4111 * l2arc_write_boost extra write bytes during device warmup
4112 * l2arc_noprefetch skip caching prefetched buffers
4113 * l2arc_nocompress skip compressing buffers
4114 * l2arc_headroom number of max device writes to precache
4115 * l2arc_headroom_boost when we find compressed buffers during ARC
4116 * scanning, we multiply headroom by this
4117 * percentage factor for the next scan cycle,
4118 * since more compressed buffers are likely to
4119 * be present
4120 * l2arc_feed_secs seconds between L2ARC writing
4121 *
4122 * Tunables may be removed or added as future performance improvements are
4123 * integrated, and also may become zpool properties.
4124 *
4125 * There are three key functions that control how the L2ARC warms up:
4126 *
4127 * l2arc_write_eligible() check if a buffer is eligible to cache
4128 * l2arc_write_size() calculate how much to write
4129 * l2arc_write_interval() calculate sleep delay between writes
4130 *
4131 * These three functions determine what to write, how much, and how quickly
4132 * to send writes.
4133 */
4134
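/*
 * Illustrative sketch (editor addition, not part of the original source):
 * how the scan headroom described above is derived.  l2arc_write_buffers()
 * computes headroom = target_sz * l2arc_headroom and, when compressible
 * buffers were seen on the previous cycle, scales it by
 * l2arc_headroom_boost percent.  For example, an 8 MiB write target with
 * a headroom factor of 2 and a boost of 200 percent gives 32 MiB of
 * headroom for the next scan.
 */
static uint64_t
example_scan_headroom(uint64_t target_sz, uint64_t headroom_factor,
    uint64_t boost_pct, boolean_t boosted)
{
	uint64_t headroom = target_sz * headroom_factor;

	if (boosted)
		headroom = (headroom * boost_pct) / 100;
	return (headroom);
}
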
4135 static boolean_t
4136 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4137 {
4138 /*
4139 * A buffer is *not* eligible for the L2ARC if it:
4140 * 1. belongs to a different spa.
4141 * 2. is already cached on the L2ARC.
4142 * 3. has an I/O in progress (it may be an incomplete read).
4143 * 4. is flagged not eligible (zfs property).
4144 */
4145 if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4146 HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4147 return (B_FALSE);
4148
4149 return (B_TRUE);
4150 }
4151
4152 static uint64_t
4153 l2arc_write_size(void)
4154 {
4155 uint64_t size;
4156
4157 /*
4158 * Make sure our globals have meaningful values in case the user
4159 * altered them.
4160 */
4161 size = l2arc_write_max;
4162 if (size == 0) {
4163 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4164 "be greater than zero, resetting it to the default (%d)",
4165 L2ARC_WRITE_SIZE);
4166 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4167 }
4168
4169 if (arc_warm == B_FALSE)
4170 size += l2arc_write_boost;
4171
4172 return (size);
4173
4174 }
4175
4176 static clock_t
4177 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4178 {
4179 clock_t interval, next, now;
4180
4181 /*
4182 * If the ARC lists are busy, increase our write rate; if the
4183 * lists are stale, idle back. This is achieved by checking
4184 * how much we previously wrote - if it was more than half of
4185 * what we wanted, schedule the next write much sooner.
4186 */
4187 if (l2arc_feed_again && wrote > (wanted / 2))
4188 interval = (hz * l2arc_feed_min_ms) / 1000;
4189 else
4190 interval = hz * l2arc_feed_secs;
4191
4192 now = ddi_get_lbolt();
4193 next = MAX(now, MIN(now + interval, began + interval));
4194
4195 return (next);
4196 }
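
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the wake-up computed above, with hz assumed to be 1000 ticks/second.
 * If a feed began at tick 10000 and finished at tick 10400, and more than
 * half of the wanted bytes were written with l2arc_feed_again set, then
 * interval = 200 ticks and next = MAX(10400, MIN(10600, 10200)) = 10400,
 * i.e. the thread feeds again immediately because the fast interval
 * already elapsed while writing.
 */
static clock_t
example_next_feed_tick(clock_t began, clock_t now, clock_t interval)
{
	clock_t next = began + interval;	/* measured from the write start */

	if (next > now + interval)
		next = now + interval;		/* at most one interval from now */
	if (next < now)
		next = now;			/* never schedule in the past */
	return (next);
}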
4197
4198 static void
4199 l2arc_hdr_stat_add(void)
4200 {
4201 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
4202 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4203 }
4204
4205 static void
4206 l2arc_hdr_stat_remove(void)
4207 {
4208 ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
4209 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4210 }
4211
4212 /*
4213 * Cycle through L2ARC devices. This is how L2ARC load balances.
4214 * If a device is returned, this also returns holding the spa config lock.
4215 */
4216 static l2arc_dev_t *
4217 l2arc_dev_get_next(void)
4218 {
4219 l2arc_dev_t *first, *next = NULL;
4220
4221 /*
4222 * Lock out the removal of spas (spa_namespace_lock), then removal
4223 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
4224 * both locks will be dropped and a spa config lock held instead.
4225 */
4226 mutex_enter(&spa_namespace_lock);
4227 mutex_enter(&l2arc_dev_mtx);
4228
4229 /* if there are no vdevs, there is nothing to do */
4230 if (l2arc_ndev == 0)
4231 goto out;
4232
4233 first = NULL;
4234 next = l2arc_dev_last;
4235 do {
4236 /* loop around the list looking for a non-faulted vdev */
4237 if (next == NULL) {
4238 next = list_head(l2arc_dev_list);
4239 } else {
4240 next = list_next(l2arc_dev_list, next);
4241 if (next == NULL)
4242 next = list_head(l2arc_dev_list);
4243 }
4244
4245 /* if we have come back to the start, bail out */
4246 if (first == NULL)
4247 first = next;
4248 else if (next == first)
4249 break;
4250
4251 } while (vdev_is_dead(next->l2ad_vdev));
4252
4253 /* if we were unable to find any usable vdevs, return NULL */
4254 if (vdev_is_dead(next->l2ad_vdev))
4255 next = NULL;
4256
4257 l2arc_dev_last = next;
4258
4259 out:
4260 mutex_exit(&l2arc_dev_mtx);
4261
4262 /*
4263 * Grab the config lock to prevent the 'next' device from being
4264 * removed while we are writing to it.
4265 */
4266 if (next != NULL)
4267 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4268 mutex_exit(&spa_namespace_lock);
4269
4270 return (next);
4271 }
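
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the device rotor above expressed over a plain array instead of the
 * kernel list_t, to make the round-robin-with-skip behaviour explicit.
 * 'faulted' and the index-based layout are hypothetical stand-ins for
 * vdev_is_dead() and l2arc_dev_list.
 */
static int
example_next_device(const int *faulted, int ndev, int last)
{
	int i, idx;

	if (ndev == 0)
		return (-1);			/* no cache devices at all */

	for (i = 1; i <= ndev; i++) {
		idx = (last + i) % ndev;	/* continue after the last device used */
		if (!faulted[idx])
			return (idx);		/* first healthy device wins */
	}
	return (-1);				/* every device is faulted */
}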
4272
4273 /*
4274 * Free buffers that were tagged for destruction.
4275 */
4276 static void
4277 l2arc_do_free_on_write(void)
4278 {
4279 list_t *buflist;
4280 l2arc_data_free_t *df, *df_prev;
4281
4282 mutex_enter(&l2arc_free_on_write_mtx);
4283 buflist = l2arc_free_on_write;
4284
4285 for (df = list_tail(buflist); df; df = df_prev) {
4286 df_prev = list_prev(buflist, df);
4287 ASSERT(df->l2df_data != NULL);
4288 ASSERT(df->l2df_func != NULL);
4289 df->l2df_func(df->l2df_data, df->l2df_size);
4290 list_remove(buflist, df);
4291 kmem_free(df, sizeof (l2arc_data_free_t));
4292 }
4293
4294 mutex_exit(&l2arc_free_on_write_mtx);
4295 }
4296
4297 /*
4298 * A write to a cache device has completed. Update all headers to allow
4299 * reads from these buffers to begin.
4300 */
4301 static void
4302 l2arc_write_done(zio_t *zio)
4303 {
4304 l2arc_write_callback_t *cb;
4305 l2arc_dev_t *dev;
4306 list_t *buflist;
4307 arc_buf_hdr_t *head, *ab, *ab_prev;
4308 l2arc_buf_hdr_t *abl2;
4309 kmutex_t *hash_lock;
4310
4311 cb = zio->io_private;
4312 ASSERT(cb != NULL);
4313 dev = cb->l2wcb_dev;
4314 ASSERT(dev != NULL);
4315 head = cb->l2wcb_head;
4316 ASSERT(head != NULL);
4317 buflist = dev->l2ad_buflist;
4318 ASSERT(buflist != NULL);
4319 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4320 l2arc_write_callback_t *, cb);
4321
4322 if (zio->io_error != 0)
4323 ARCSTAT_BUMP(arcstat_l2_writes_error);
4324
4325 mutex_enter(&l2arc_buflist_mtx);
4326
4327 /*
4328 * All writes completed, or an error was hit.
4329 */
4330 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4331 ab_prev = list_prev(buflist, ab);
4332
4333 hash_lock = HDR_LOCK(ab);
4334 if (!mutex_tryenter(hash_lock)) {
4335 /*
4336 * This buffer misses out. It may be in a stage
4337 * of eviction. Its ARC_L2_WRITING flag will be
4338 * left set, denying reads to this buffer.
4339 */
4340 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4341 continue;
4342 }
4343
4344 abl2 = ab->b_l2hdr;
4345
4346 /*
4347 * Release the temporary compressed buffer as soon as possible.
4348 */
4349 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4350 l2arc_release_cdata_buf(ab);
4351
4352 if (zio->io_error != 0) {
4353 /*
4354 * Error - drop L2ARC entry.
4355 */
4356 list_remove(buflist, ab);
4357 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4358 ab->b_l2hdr = NULL;
4359 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4360 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4361 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4362 }
4363
4364 /*
4365 * Allow ARC to begin reads to this L2ARC entry.
4366 */
4367 ab->b_flags &= ~ARC_L2_WRITING;
4368
4369 mutex_exit(hash_lock);
4370 }
4371
4372 atomic_inc_64(&l2arc_writes_done);
4373 list_remove(buflist, head);
4374 kmem_cache_free(hdr_cache, head);
4375 mutex_exit(&l2arc_buflist_mtx);
4376
4377 l2arc_do_free_on_write();
4378
4379 kmem_free(cb, sizeof (l2arc_write_callback_t));
4380 }
4381
4382 /*
4383 * A read to a cache device completed. Validate buffer contents before
4384 * handing over to the regular ARC routines.
4385 */
4386 static void
4387 l2arc_read_done(zio_t *zio)
4388 {
4389 l2arc_read_callback_t *cb;
4390 arc_buf_hdr_t *hdr;
4391 arc_buf_t *buf;
4392 kmutex_t *hash_lock;
4393 int equal;
4394
4395 ASSERT(zio->io_vd != NULL);
4396 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4397
4398 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4399
4400 cb = zio->io_private;
4401 ASSERT(cb != NULL);
4402 buf = cb->l2rcb_buf;
4403 ASSERT(buf != NULL);
4404
4405 hash_lock = HDR_LOCK(buf->b_hdr);
4406 mutex_enter(hash_lock);
4407 hdr = buf->b_hdr;
4408 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4409
4410 /*
4411 * If the buffer was compressed, decompress it first.
4412 */
4413 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4414 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4415 ASSERT(zio->io_data != NULL);
4416
4417 /*
4418 * Check this survived the L2ARC journey.
4419 */
4420 equal = arc_cksum_equal(buf);
4421 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4422 mutex_exit(hash_lock);
4423 zio->io_private = buf;
4424 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4425 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
4426 arc_read_done(zio);
4427 } else {
4428 mutex_exit(hash_lock);
4429 /*
4430 * Buffer didn't survive caching. Increment stats and
4431 * reissue to the original storage device.
4432 */
4433 if (zio->io_error != 0) {
4434 ARCSTAT_BUMP(arcstat_l2_io_error);
4435 } else {
4436 zio->io_error = EIO;
4437 }
4438 if (!equal)
4439 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4440
4441 /*
4442 * If there's no waiter, issue an async i/o to the primary
4443 * storage now. If there *is* a waiter, the caller must
4444 * issue the i/o in a context where it's OK to block.
4445 */
4446 if (zio->io_waiter == NULL) {
4447 zio_t *pio = zio_unique_parent(zio);
4448
4449 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4450
4451 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4452 buf->b_data, zio->io_size, arc_read_done, buf,
4453 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4454 }
4455 }
4456
4457 kmem_free(cb, sizeof (l2arc_read_callback_t));
4458 }
4459
4460 /*
4461 * This is the list priority from which the L2ARC will search for pages to
4462 * cache. This is used within loops (0..3) to cycle through lists in the
4463 * desired order. This order can have a significant effect on cache
4464 * performance.
4465 *
4466 * Currently the metadata lists are hit first, MFU then MRU, followed by
4467 * the data lists. This function returns a locked list, and also returns
4468 * the lock pointer.
4469 */
4470 static list_t *
4471 l2arc_list_locked(int list_num, kmutex_t **lock)
4472 {
4473 list_t *list = NULL;
4474
4475 ASSERT(list_num >= 0 && list_num <= 3);
4476
4477 switch (list_num) {
4478 case 0:
4479 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4480 *lock = &arc_mfu->arcs_mtx;
4481 break;
4482 case 1:
4483 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4484 *lock = &arc_mru->arcs_mtx;
4485 break;
4486 case 2:
4487 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4488 *lock = &arc_mfu->arcs_mtx;
4489 break;
4490 case 3:
4491 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4492 *lock = &arc_mru->arcs_mtx;
4493 break;
4494 }
4495
4496 ASSERT(!(MUTEX_HELD(*lock)));
4497 mutex_enter(*lock);
4498 return (list);
4499 }
4500
4501 /*
4502 * Evict buffers from the device write hand to the distance specified in
4503 * bytes. This distance may span populated buffers, or it may span nothing.
4504 * This is clearing a region on the L2ARC device ready for writing.
4505 * If the 'all' boolean is set, every buffer is evicted.
4506 */
4507 static void
4508 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4509 {
4510 list_t *buflist;
4511 l2arc_buf_hdr_t *abl2;
4512 arc_buf_hdr_t *ab, *ab_prev;
4513 kmutex_t *hash_lock;
4514 uint64_t taddr;
4515
4516 buflist = dev->l2ad_buflist;
4517
4518 if (buflist == NULL)
4519 return;
4520
4521 if (!all && dev->l2ad_first) {
4522 /*
4523 * This is the first sweep through the device. There is
4524 * nothing to evict.
4525 */
4526 return;
4527 }
4528
4529 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4530 /*
4531 * When nearing the end of the device, evict to the end
4532 * before the device write hand jumps to the start.
4533 */
4534 taddr = dev->l2ad_end;
4535 } else {
4536 taddr = dev->l2ad_hand + distance;
4537 }
4538 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4539 uint64_t, taddr, boolean_t, all);
4540
4541 top:
4542 mutex_enter(&l2arc_buflist_mtx);
4543 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4544 ab_prev = list_prev(buflist, ab);
4545
4546 hash_lock = HDR_LOCK(ab);
4547 if (!mutex_tryenter(hash_lock)) {
4548 /*
4549 * Missed the hash lock. Retry.
4550 */
4551 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4552 mutex_exit(&l2arc_buflist_mtx);
4553 mutex_enter(hash_lock);
4554 mutex_exit(hash_lock);
4555 goto top;
4556 }
4557
4558 if (HDR_L2_WRITE_HEAD(ab)) {
4559 /*
4560 * We hit a write head node. Leave it for
4561 * l2arc_write_done().
4562 */
4563 list_remove(buflist, ab);
4564 mutex_exit(hash_lock);
4565 continue;
4566 }
4567
4568 if (!all && ab->b_l2hdr != NULL &&
4569 (ab->b_l2hdr->b_daddr > taddr ||
4570 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4571 /*
4572 * We've evicted to the target address,
4573 * or the end of the device.
4574 */
4575 mutex_exit(hash_lock);
4576 break;
4577 }
4578
4579 if (HDR_FREE_IN_PROGRESS(ab)) {
4580 /*
4581 * Already on the path to destruction.
4582 */
4583 mutex_exit(hash_lock);
4584 continue;
4585 }
4586
4587 if (ab->b_state == arc_l2c_only) {
4588 ASSERT(!HDR_L2_READING(ab));
4589 /*
4590 * This doesn't exist in the ARC. Destroy.
4591 * arc_hdr_destroy() will call list_remove()
4592 * and decrement arcstat_l2_size.
4593 */
4594 arc_change_state(arc_anon, ab, hash_lock);
4595 arc_hdr_destroy(ab);
4596 } else {
4597 /*
4598 * Invalidate issued or about to be issued
4599 * reads, since we may be about to write
4600 * over this location.
4601 */
4602 if (HDR_L2_READING(ab)) {
4603 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4604 ab->b_flags |= ARC_L2_EVICTED;
4605 }
4606
4607 /*
4608 * Tell ARC this no longer exists in L2ARC.
4609 */
4610 if (ab->b_l2hdr != NULL) {
4611 abl2 = ab->b_l2hdr;
4612 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4613 ab->b_l2hdr = NULL;
4614 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4615 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4616 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4617 }
4618 list_remove(buflist, ab);
4619
4620 /*
4621 * This may have been leftover after a
4622 * failed write.
4623 */
4624 ab->b_flags &= ~ARC_L2_WRITING;
4625 }
4626 mutex_exit(hash_lock);
4627 }
4628 mutex_exit(&l2arc_buflist_mtx);
4629
4630 vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4631 dev->l2ad_evict = taddr;
4632 }
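
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the eviction target address chosen at the top of l2arc_evict().  When
 * the write hand is within two write distances of the end of the device,
 * the entire tail is cleared so the hand can wrap back to l2ad_start on
 * the next pass; otherwise only 'distance' bytes ahead of the hand are
 * cleared.
 */
static uint64_t
example_evict_target(uint64_t hand, uint64_t end, uint64_t distance)
{
	if (hand >= end - (2 * distance))
		return (end);			/* evict through to the device end */
	return (hand + distance);		/* evict just ahead of the write hand */
}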
4633
4634 /*
4635 * Find and write ARC buffers to the L2ARC device.
4636 *
4637 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4638 * for reading until they have completed writing.
4639 * The headroom_boost is an in-out parameter used to maintain headroom boost
4640 * state between calls to this function.
4641 *
4642 * Returns the number of bytes actually written (which may be smaller than
4643 * the delta by which the device hand has changed due to alignment).
4644 */
4645 static uint64_t
4646 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4647 boolean_t *headroom_boost)
4648 {
4649 arc_buf_hdr_t *ab, *ab_prev, *head;
4650 list_t *list;
4651 uint64_t write_asize, write_psize, write_sz, headroom,
4652 buf_compress_minsz;
4653 void *buf_data;
4654 kmutex_t *list_lock = NULL;
4655 boolean_t full;
4656 l2arc_write_callback_t *cb;
4657 zio_t *pio, *wzio;
4658 uint64_t guid = spa_load_guid(spa);
4659 int try;
4660 const boolean_t do_headroom_boost = *headroom_boost;
4661
4662 ASSERT(dev->l2ad_vdev != NULL);
4663
4664 /* Lower the flag now, we might want to raise it again later. */
4665 *headroom_boost = B_FALSE;
4666
4667 pio = NULL;
4668 write_sz = write_asize = write_psize = 0;
4669 full = B_FALSE;
4670 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4671 head->b_flags |= ARC_L2_WRITE_HEAD;
4672
4673 /*
4674 * We will want to try to compress buffers that are at least 2x the
4675 * device sector size.
4676 */
4677 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4678
4679 /*
4680 * Copy buffers for L2ARC writing.
4681 */
4682 mutex_enter(&l2arc_buflist_mtx);
4683 for (try = 0; try <= 3; try++) {
4684 uint64_t passed_sz = 0;
4685
4686 list = l2arc_list_locked(try, &list_lock);
4687
4688 /*
4689 * L2ARC fast warmup.
4690 *
4691 * Until the ARC is warm and starts to evict, read from the
4692 * head of the ARC lists rather than the tail.
4693 */
4694 if (arc_warm == B_FALSE)
4695 ab = list_head(list);
4696 else
4697 ab = list_tail(list);
4698
4699 headroom = target_sz * l2arc_headroom;
4700 if (do_headroom_boost)
4701 headroom = (headroom * l2arc_headroom_boost) / 100;
4702
4703 for (; ab; ab = ab_prev) {
4704 l2arc_buf_hdr_t *l2hdr;
4705 kmutex_t *hash_lock;
4706 uint64_t buf_sz;
4707
4708 if (arc_warm == B_FALSE)
4709 ab_prev = list_next(list, ab);
4710 else
4711 ab_prev = list_prev(list, ab);
4712
4713 hash_lock = HDR_LOCK(ab);
4714 if (!mutex_tryenter(hash_lock)) {
4715 /*
4716 * Skip this buffer rather than waiting.
4717 */
4718 continue;
4719 }
4720
4721 passed_sz += ab->b_size;
4722 if (passed_sz > headroom) {
4723 /*
4724 * Searched too far.
4725 */
4726 mutex_exit(hash_lock);
4727 break;
4728 }
4729
4730 if (!l2arc_write_eligible(guid, ab)) {
4731 mutex_exit(hash_lock);
4732 continue;
4733 }
4734
4735 if ((write_sz + ab->b_size) > target_sz) {
4736 full = B_TRUE;
4737 mutex_exit(hash_lock);
4738 break;
4739 }
4740
4741 if (pio == NULL) {
4742 /*
4743 * Insert a dummy header on the buflist so
4744 * l2arc_write_done() can find where the
4745 * write buffers begin without searching.
4746 */
4747 list_insert_head(dev->l2ad_buflist, head);
4748
4749 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
4750 KM_PUSHPAGE);
4751 cb->l2wcb_dev = dev;
4752 cb->l2wcb_head = head;
4753 pio = zio_root(spa, l2arc_write_done, cb,
4754 ZIO_FLAG_CANFAIL);
4755 }
4756
4757 /*
4758 * Create and add a new L2ARC header.
4759 */
4760 l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
4761 KM_PUSHPAGE);
4762 l2hdr->b_dev = dev;
4763 arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4764
4765 ab->b_flags |= ARC_L2_WRITING;
4766
4767 /*
4768 * Temporarily stash the data buffer in b_tmp_cdata.
4769 * The subsequent write step will pick it up from
4770 * there. This is because we can't access ab->b_buf
4771 * without holding the hash_lock, which we in turn
4772 * can't take without holding the ARC list locks
4773 * (which we want to avoid during compression/writing).
4774 */
4775 l2hdr->b_compress = ZIO_COMPRESS_OFF;
4776 l2hdr->b_asize = ab->b_size;
4777 l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4778
4779 buf_sz = ab->b_size;
4780 ab->b_l2hdr = l2hdr;
4781
4782 list_insert_head(dev->l2ad_buflist, ab);
4783
4784 /*
4785 * Compute and store the buffer cksum before
4786 * writing. On debug the cksum is verified first.
4787 */
4788 arc_cksum_verify(ab->b_buf);
4789 arc_cksum_compute(ab->b_buf, B_TRUE);
4790
4791 mutex_exit(hash_lock);
4792
4793 write_sz += buf_sz;
4794 }
4795
4796 mutex_exit(list_lock);
4797
4798 if (full == B_TRUE)
4799 break;
4800 }
4801
4802 /* No buffers selected for writing? */
4803 if (pio == NULL) {
4804 ASSERT0(write_sz);
4805 mutex_exit(&l2arc_buflist_mtx);
4806 kmem_cache_free(hdr_cache, head);
4807 return (0);
4808 }
4809
4810 /*
4811 * Now start writing the buffers. We're starting at the write head
4812 * and work backwards, retracing the course of the buffer selector
4813 * loop above.
4814 */
4815 for (ab = list_prev(dev->l2ad_buflist, head); ab;
4816 ab = list_prev(dev->l2ad_buflist, ab)) {
4817 l2arc_buf_hdr_t *l2hdr;
4818 uint64_t buf_sz;
4819
4820 /*
4821 * We shouldn't need to lock the buffer here, since we flagged
4822 * it as ARC_L2_WRITING in the previous step, but we must take
4823 * care to only access its L2 cache parameters. In particular,
4824 * ab->b_buf may be invalid by now due to ARC eviction.
4825 */
4826 l2hdr = ab->b_l2hdr;
4827 l2hdr->b_daddr = dev->l2ad_hand;
4828
4829 if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
4830 l2hdr->b_asize >= buf_compress_minsz) {
4831 if (l2arc_compress_buf(l2hdr)) {
4832 /*
4833 * If compression succeeded, enable headroom
4834 * boost on the next scan cycle.
4835 */
4836 *headroom_boost = B_TRUE;
4837 }
4838 }
4839
4840 /*
4841 * Pick up the buffer data we had previously stashed away
4842 * (and now potentially also compressed).
4843 */
4844 buf_data = l2hdr->b_tmp_cdata;
4845 buf_sz = l2hdr->b_asize;
4846
4847 /* Compression may have squashed the buffer to zero length. */
4848 if (buf_sz != 0) {
4849 uint64_t buf_p_sz;
4850
4851 wzio = zio_write_phys(pio, dev->l2ad_vdev,
4852 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4853 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4854 ZIO_FLAG_CANFAIL, B_FALSE);
4855
4856 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4857 zio_t *, wzio);
4858 (void) zio_nowait(wzio);
4859
4860 write_asize += buf_sz;
4861 /*
4862 * Keep the clock hand suitably device-aligned.
4863 */
4864 buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4865 write_psize += buf_p_sz;
4866 dev->l2ad_hand += buf_p_sz;
4867 }
4868 }
4869
4870 mutex_exit(&l2arc_buflist_mtx);
4871
4872 ASSERT3U(write_asize, <=, target_sz);
4873 ARCSTAT_BUMP(arcstat_l2_writes_sent);
4874 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4875 ARCSTAT_INCR(arcstat_l2_size, write_sz);
4876 ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4877 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4878
4879 /*
4880 * Bump device hand to the device start if it is approaching the end.
4881 * l2arc_evict() will already have evicted ahead for this case.
4882 */
4883 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4884 vdev_space_update(dev->l2ad_vdev,
4885 dev->l2ad_end - dev->l2ad_hand, 0, 0);
4886 dev->l2ad_hand = dev->l2ad_start;
4887 dev->l2ad_evict = dev->l2ad_start;
4888 dev->l2ad_first = B_FALSE;
4889 }
4890
4891 dev->l2ad_writing = B_TRUE;
4892 (void) zio_wait(pio);
4893 dev->l2ad_writing = B_FALSE;
4894
4895 return (write_asize);
4896 }
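
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * how the L2ARC write hand stays device-aligned.  For a plain leaf vdev,
 * vdev_psize_to_asize() essentially rounds the write up to the device
 * sector size (1 << ashift), which is why write_psize can exceed
 * write_asize above.  'ashift' here is an assumed device sector shift.
 */
static uint64_t
example_advance_write_hand(uint64_t hand, uint64_t buf_sz, int ashift)
{
	uint64_t sector = 1ULL << ashift;
	uint64_t aligned = (buf_sz + sector - 1) & ~(sector - 1);

	return (hand + aligned);		/* next free offset on the device */
}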
4897
4898 /*
4899 * Compresses an L2ARC buffer.
4900 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4901 * size in l2hdr->b_asize. This routine tries to compress the data and
4902 * depending on the compression result there are three possible outcomes:
4903 * *) The buffer was incompressible. The original l2hdr contents were left
4904 * untouched and are ready for writing to an L2 device.
4905 * *) The buffer was all-zeros, so there is no need to write it to an L2
4906 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
4907 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4908 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4909 * data buffer which holds the compressed data to be written, and b_asize
4910 * tells us how much data there is. b_compress is set to the appropriate
4911 * compression algorithm. Once writing is done, invoke
4912 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4913 *
4914 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4915 * buffer was incompressible).
4916 */
4917 static boolean_t
4918 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4919 {
4920 void *cdata;
4921 size_t csize, len;
4922
4923 ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4924 ASSERT(l2hdr->b_tmp_cdata != NULL);
4925
4926 len = l2hdr->b_asize;
4927 cdata = zio_data_buf_alloc(len);
4928 csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4929 cdata, l2hdr->b_asize);
4930
4931 if (csize == 0) {
4932 /* zero block, indicate that there's nothing to write */
4933 zio_data_buf_free(cdata, len);
4934 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4935 l2hdr->b_asize = 0;
4936 l2hdr->b_tmp_cdata = NULL;
4937 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4938 return (B_TRUE);
4939 } else if (csize > 0 && csize < len) {
4940 /*
4941 * Compression succeeded, we'll keep the cdata around for
4942 * writing and release it afterwards.
4943 */
4944 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4945 l2hdr->b_asize = csize;
4946 l2hdr->b_tmp_cdata = cdata;
4947 ARCSTAT_BUMP(arcstat_l2_compress_successes);
4948 return (B_TRUE);
4949 } else {
4950 /*
4951 * Compression failed, release the compressed buffer.
4952 * l2hdr will be left unmodified.
4953 */
4954 zio_data_buf_free(cdata, len);
4955 ARCSTAT_BUMP(arcstat_l2_compress_failures);
4956 return (B_FALSE);
4957 }
4958 }
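
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the three-way outcome documented above, reduced to the decision made on
 * the compressed size.  The enum labels are hypothetical and exist only
 * for this sketch; the real function records the result in b_compress,
 * b_asize and b_tmp_cdata instead.
 */
typedef enum {
	EXAMPLE_L2_EMPTY,		/* all zeros, nothing to write */
	EXAMPLE_L2_COMPRESSED,		/* keep the smaller temporary cdata buffer */
	EXAMPLE_L2_UNCOMPRESSED		/* no gain, write the original buffer as-is */
} example_l2_outcome_t;

static example_l2_outcome_t
example_compress_outcome(size_t csize, size_t len)
{
	if (csize == 0)
		return (EXAMPLE_L2_EMPTY);
	if (csize < len)
		return (EXAMPLE_L2_COMPRESSED);
	return (EXAMPLE_L2_UNCOMPRESSED);
}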
4959
4960 /*
4961 * Decompresses a zio read back from an l2arc device. On success, the
4962 * underlying zio's io_data buffer is overwritten by the uncompressed
4963 * version. On decompression error (corrupt compressed stream), the
4964 * zio->io_error value is set to signal an I/O error.
4965 *
4966 * Please note that the compressed data stream is not checksummed, so
4967 * if the underlying device is experiencing data corruption, we may feed
4968 * corrupt data to the decompressor; the decompressor therefore needs to be
4969 * able to handle this situation (LZ4 does).
4970 */
4971 static void
4972 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4973 {
4974 uint64_t csize;
4975 void *cdata;
4976
4977 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4978
4979 if (zio->io_error != 0) {
4980 /*
4981 * An I/O error has occurred; just restore the original I/O
4982 * size in preparation for a main pool read.
4983 */
4984 zio->io_orig_size = zio->io_size = hdr->b_size;
4985 return;
4986 }
4987
4988 if (c == ZIO_COMPRESS_EMPTY) {
4989 /*
4990 * An empty buffer results in a null zio, which means we
4991 * need to fill its io_data after we're done restoring the
4992 * buffer's contents.
4993 */
4994 ASSERT(hdr->b_buf != NULL);
4995 bzero(hdr->b_buf->b_data, hdr->b_size);
4996 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
4997 } else {
4998 ASSERT(zio->io_data != NULL);
4999 /*
5000 * We copy the compressed data from the start of the arc buffer
5001 * (the zio_read will have pulled in only what we need, the
5002 * rest is garbage which we will overwrite at decompression)
5003 * and then decompress back to the ARC data buffer. This way we
5004 * can minimize copying by simply decompressing back over the
5005 * original compressed data (rather than decompressing to an
5006 * aux buffer and then copying back the uncompressed buffer,
5007 * which is likely to be much larger).
5008 */
5009 csize = zio->io_size;
5010 cdata = zio_data_buf_alloc(csize);
5011 bcopy(zio->io_data, cdata, csize);
5012 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5013 hdr->b_size) != 0)
5014 zio->io_error = EIO;
5015 zio_data_buf_free(cdata, csize);
5016 }
5017
5018 /* Restore the expected uncompressed IO size. */
5019 zio->io_orig_size = zio->io_size = hdr->b_size;
5020 }
5021
5022 /*
5023 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5024 * This buffer serves as a temporary holder of compressed data while
5025 * the buffer entry is being written to an l2arc device. Once that is
5026 * done, we can dispose of it.
5027 */
5028 static void
5029 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5030 {
5031 l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5032
5033 if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5034 /*
5035 * If the data was compressed, then we've allocated a
5036 * temporary buffer for it, so now we need to release it.
5037 */
5038 ASSERT(l2hdr->b_tmp_cdata != NULL);
5039 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5040 }
5041 l2hdr->b_tmp_cdata = NULL;
5042 }
5043
5044 /*
5045 * This thread feeds the L2ARC at regular intervals. This is the beating
5046 * heart of the L2ARC.
5047 */
5048 static void
5049 l2arc_feed_thread(void)
5050 {
5051 callb_cpr_t cpr;
5052 l2arc_dev_t *dev;
5053 spa_t *spa;
5054 uint64_t size, wrote;
5055 clock_t begin, next = ddi_get_lbolt();
5056 boolean_t headroom_boost = B_FALSE;
5057
5058 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5059
5060 mutex_enter(&l2arc_feed_thr_lock);
5061
5062 while (l2arc_thread_exit == 0) {
5063 CALLB_CPR_SAFE_BEGIN(&cpr);
5064 (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
5065 &l2arc_feed_thr_lock, next);
5066 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5067 next = ddi_get_lbolt() + hz;
5068
5069 /*
5070 * Quick check for L2ARC devices.
5071 */
5072 mutex_enter(&l2arc_dev_mtx);
5073 if (l2arc_ndev == 0) {
5074 mutex_exit(&l2arc_dev_mtx);
5075 continue;
5076 }
5077 mutex_exit(&l2arc_dev_mtx);
5078 begin = ddi_get_lbolt();
5079
5080 /*
5081 * This selects the next l2arc device to write to, and in
5082 * doing so the next spa to feed from: dev->l2ad_spa. This
5083 * will return NULL if there are now no l2arc devices or if
5084 * they are all faulted.
5085 *
5086 * If a device is returned, its spa's config lock is also
5087 * held to prevent device removal. l2arc_dev_get_next()
5088 * will grab and release l2arc_dev_mtx.
5089 */
5090 if ((dev = l2arc_dev_get_next()) == NULL)
5091 continue;
5092
5093 spa = dev->l2ad_spa;
5094 ASSERT(spa != NULL);
5095
5096 /*
5097 * If the pool is read-only then force the feed thread to
5098 * sleep a little longer.
5099 */
5100 if (!spa_writeable(spa)) {
5101 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5102 spa_config_exit(spa, SCL_L2ARC, dev);
5103 continue;
5104 }
5105
5106 /*
5107 * Avoid contributing to memory pressure.
5108 */
5109 if (arc_no_grow) {
5110 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5111 spa_config_exit(spa, SCL_L2ARC, dev);
5112 continue;
5113 }
5114
5115 ARCSTAT_BUMP(arcstat_l2_feeds);
5116
5117 size = l2arc_write_size();
5118
5119 /*
5120 * Evict L2ARC buffers that will be overwritten.
5121 */
5122 l2arc_evict(dev, size, B_FALSE);
5123
5124 /*
5125 * Write ARC buffers.
5126 */
5127 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5128
5129 /*
5130 * Calculate interval between writes.
5131 */
5132 next = l2arc_write_interval(begin, size, wrote);
5133 spa_config_exit(spa, SCL_L2ARC, dev);
5134 }
5135
5136 l2arc_thread_exit = 0;
5137 cv_broadcast(&l2arc_feed_thr_cv);
5138 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
5139 thread_exit();
5140 }
5141
5142 boolean_t
5143 l2arc_vdev_present(vdev_t *vd)
5144 {
5145 l2arc_dev_t *dev;
5146
5147 mutex_enter(&l2arc_dev_mtx);
5148 for (dev = list_head(l2arc_dev_list); dev != NULL;
5149 dev = list_next(l2arc_dev_list, dev)) {
5150 if (dev->l2ad_vdev == vd)
5151 break;
5152 }
5153 mutex_exit(&l2arc_dev_mtx);
5154
5155 return (dev != NULL);
5156 }
5157
5158 /*
5159 * Add a vdev for use by the L2ARC. By this point the spa has already
5160 * validated the vdev and opened it.
5161 */
5162 void
5163 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5164 {
5165 l2arc_dev_t *adddev;
5166
5167 ASSERT(!l2arc_vdev_present(vd));
5168
5169 /*
5170 * Create a new l2arc device entry.
5171 */
5172 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5173 adddev->l2ad_spa = spa;
5174 adddev->l2ad_vdev = vd;
5175 adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5176 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5177 adddev->l2ad_hand = adddev->l2ad_start;
5178 adddev->l2ad_evict = adddev->l2ad_start;
5179 adddev->l2ad_first = B_TRUE;
5180 adddev->l2ad_writing = B_FALSE;
5181 list_link_init(&adddev->l2ad_node);
5182
5183 /*
5184 * This is a list of all ARC buffers that are still valid on the
5185 * device.
5186 */
5187 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5188 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5189 offsetof(arc_buf_hdr_t, b_l2node));
5190
5191 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5192
5193 /*
5194 * Add device to global list
5195 */
5196 mutex_enter(&l2arc_dev_mtx);
5197 list_insert_head(l2arc_dev_list, adddev);
5198 atomic_inc_64(&l2arc_ndev);
5199 mutex_exit(&l2arc_dev_mtx);
5200 }
5201
5202 /*
5203 * Remove a vdev from the L2ARC.
5204 */
5205 void
5206 l2arc_remove_vdev(vdev_t *vd)
5207 {
5208 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5209
5210 /*
5211 * Find the device by vdev
5212 */
5213 mutex_enter(&l2arc_dev_mtx);
5214 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5215 nextdev = list_next(l2arc_dev_list, dev);
5216 if (vd == dev->l2ad_vdev) {
5217 remdev = dev;
5218 break;
5219 }
5220 }
5221 ASSERT(remdev != NULL);
5222
5223 /*
5224 * Remove device from global list
5225 */
5226 list_remove(l2arc_dev_list, remdev);
5227 l2arc_dev_last = NULL; /* may have been invalidated */
5228 atomic_dec_64(&l2arc_ndev);
5229 mutex_exit(&l2arc_dev_mtx);
5230
5231 /*
5232 * Clear all buflists and ARC references. L2ARC device flush.
5233 */
5234 l2arc_evict(remdev, 0, B_TRUE);
5235 list_destroy(remdev->l2ad_buflist);
5236 kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5237 kmem_free(remdev, sizeof (l2arc_dev_t));
5238 }
5239
5240 void
5241 l2arc_init(void)
5242 {
5243 l2arc_thread_exit = 0;
5244 l2arc_ndev = 0;
5245 l2arc_writes_sent = 0;
5246 l2arc_writes_done = 0;
5247
5248 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5249 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5250 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5251 mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5252 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5253
5254 l2arc_dev_list = &L2ARC_dev_list;
5255 l2arc_free_on_write = &L2ARC_free_on_write;
5256 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5257 offsetof(l2arc_dev_t, l2ad_node));
5258 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5259 offsetof(l2arc_data_free_t, l2df_list_node));
5260 }
5261
5262 void
5263 l2arc_fini(void)
5264 {
5265 /*
5266 * This is called from dmu_fini(), which is called from spa_fini().
5267 * Because of this, we can assume that all l2arc devices have
5268 * already been removed when the pools themselves were removed.
5269 */
5270
5271 l2arc_do_free_on_write();
5272
5273 mutex_destroy(&l2arc_feed_thr_lock);
5274 cv_destroy(&l2arc_feed_thr_cv);
5275 mutex_destroy(&l2arc_dev_mtx);
5276 mutex_destroy(&l2arc_buflist_mtx);
5277 mutex_destroy(&l2arc_free_on_write_mtx);
5278
5279 list_destroy(l2arc_dev_list);
5280 list_destroy(l2arc_free_on_write);
5281 }
5282
5283 void
5284 l2arc_start(void)
5285 {
5286 if (!(spa_mode_global & FWRITE))
5287 return;
5288
5289 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5290 TS_RUN, minclsyspri);
5291 }
5292
5293 void
5294 l2arc_stop(void)
5295 {
5296 if (!(spa_mode_global & FWRITE))
5297 return;
5298
5299 mutex_enter(&l2arc_feed_thr_lock);
5300 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
5301 l2arc_thread_exit = 1;
5302 while (l2arc_thread_exit != 0)
5303 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5304 mutex_exit(&l2arc_feed_thr_lock);
5305 }
5306
5307 #if defined(_KERNEL) && defined(HAVE_SPL)
5308 EXPORT_SYMBOL(arc_read);
5309 EXPORT_SYMBOL(arc_buf_remove_ref);
5310 EXPORT_SYMBOL(arc_getbuf_func);
5311 EXPORT_SYMBOL(arc_add_prune_callback);
5312 EXPORT_SYMBOL(arc_remove_prune_callback);
5313
5314 module_param(zfs_arc_min, ulong, 0644);
5315 MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
5316
5317 module_param(zfs_arc_max, ulong, 0644);
5318 MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
5319
5320 module_param(zfs_arc_meta_limit, ulong, 0644);
5321 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
5322
5323 module_param(zfs_arc_meta_prune, int, 0644);
5324 MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
5325
5326 module_param(zfs_arc_grow_retry, int, 0644);
5327 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
5328
5329 module_param(zfs_arc_shrink_shift, int, 0644);
5330 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
5331
5332 module_param(zfs_arc_p_min_shift, int, 0644);
5333 MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
5334
5335 module_param(zfs_disable_dup_eviction, int, 0644);
5336 MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
5337
5338 module_param(zfs_arc_memory_throttle_disable, int, 0644);
5339 MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
5340
5341 module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
5342 MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
5343
5344 module_param(l2arc_write_max, ulong, 0644);
5345 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
5346
5347 module_param(l2arc_write_boost, ulong, 0644);
5348 MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
5349
5350 module_param(l2arc_headroom, ulong, 0644);
5351 MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
5352
5353 module_param(l2arc_headroom_boost, ulong, 0644);
5354 MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
5355
5356 module_param(l2arc_feed_secs, ulong, 0644);
5357 MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
5358
5359 module_param(l2arc_feed_min_ms, ulong, 0644);
5360 MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
5361
5362 module_param(l2arc_noprefetch, int, 0644);
5363 MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
5364
5365 module_param(l2arc_nocompress, int, 0644);
5366 MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
5367
5368 module_param(l2arc_feed_again, int, 0644);
5369 MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
5370
5371 module_param(l2arc_norw, int, 0644);
5372 MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
5373
5374 #endif