*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
/*
* tight.
*
* 3. The Megiddo and Modha model assumes a fixed page size. All
- * elements of the cache are therefor exactly the same size. So
+ * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
- * 128K bytes). We therefor choose a set of blocks to evict to make
+ * 128K bytes). We therefore choose a set of blocks to evict to make
* space for a cache miss that approximates as closely as possible
* the space used by the new block.
*
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
* uses method 1, while the internal arc algorithms for
- * adjusting the cache use method 2. We therefor provide two
+ * adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
* protected from simultaneous callbacks from arc_buf_evict()
* and arc_do_user_evicts().
*
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
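+ * As an illustrative sketch only (zpl_prune_sb() and sb below are
+ * hypothetical, not part of this file), a consumer might register
+ * such a callback as follows:
+ *
+ *	static void
+ *	zpl_prune_sb(int64_t nr_to_scan, void *priv)
+ *	{
+ *		(drop up to nr_to_scan idle dentry references on priv)
+ *	}
+ *
+ *	arc_prune_t *ap = arc_add_prune_callback(zpl_prune_sb, sb);
+ *	...
+ *	arc_remove_prune_callback(ap);
+ *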
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
#include <sys/spa.h>
#include <sys/zio.h>
+#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
-#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
+#include <sys/dmu_tx.h>
#include <zfs_fletcher.h>
+#ifndef _KERNEL
+/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
+boolean_t arc_watch = B_FALSE;
+#endif
+
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when the arc_meta_limit is reached */
+int zfs_arc_meta_prune = 1048576;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
ARC_RECLAIM_CONS /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
+/*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+int arc_evict_iterations = 100;
+
/* number of seconds before growing cache again */
-static int arc_grow_retry = 60;
+int zfs_arc_grow_retry = 5;
/* shift of arc_c for calculating both min and max arc_p */
-static int arc_p_min_shift = 4;
+int zfs_arc_p_min_shift = 4;
/* log2(fraction of arc to reclaim) */
-static int arc_shrink_shift = 5;
+int zfs_arc_shrink_shift = 5;
/*
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
-static int arc_min_prefetch_lifespan;
+int zfs_arc_min_prefetch_lifespan = HZ;
+
+/* disable proactive arc throttle due to low memory */
+int zfs_arc_memory_throttle_disable = 1;
+
+/* disable duplicate buffer eviction */
+int zfs_disable_dup_eviction = 0;
+
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
static int arc_dead;
+/* expiration time for arc_no_grow */
+static clock_t arc_grow_time = 0;
+
/*
* The arc has filled available memory and has now warmed up.
*/
/*
* These tunables are for performance analysis.
*/
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
-int zfs_arc_grow_retry = 0;
-int zfs_arc_shrink_shift = 0;
-int zfs_arc_p_min_shift = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
/*
* Note that buffers can be in one of 6 states:
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
+ arc_state_type_t arcs_state;
} arc_state_t;
/* The 6 states: */
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_deleted;
kstat_named_t arcstat_recycle_miss;
+ /*
+ * Number of buffers that could not be evicted because the hash lock
+ * was held by another thread. The lock may not necessarily be held
+ * by something using the same buffer, since hash locks are shared
+ * by multiple buffers.
+ */
kstat_named_t arcstat_mutex_miss;
+ /*
+ * Number of buffers skipped because they have I/O in progress, are
+	 * indirect prefetch buffers that have not lived long enough, or are
+ * not from the spa we're trying to evict from.
+ */
kstat_named_t arcstat_evict_skip;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
kstat_named_t arcstat_hdr_size;
kstat_named_t arcstat_data_size;
kstat_named_t arcstat_other_size;
+ kstat_named_t arcstat_anon_size;
+ kstat_named_t arcstat_anon_evict_data;
+ kstat_named_t arcstat_anon_evict_metadata;
+ kstat_named_t arcstat_mru_size;
+ kstat_named_t arcstat_mru_evict_data;
+ kstat_named_t arcstat_mru_evict_metadata;
+ kstat_named_t arcstat_mru_ghost_size;
+ kstat_named_t arcstat_mru_ghost_evict_data;
+ kstat_named_t arcstat_mru_ghost_evict_metadata;
+ kstat_named_t arcstat_mfu_size;
+ kstat_named_t arcstat_mfu_evict_data;
+ kstat_named_t arcstat_mfu_evict_metadata;
+ kstat_named_t arcstat_mfu_ghost_size;
+ kstat_named_t arcstat_mfu_ghost_evict_data;
+ kstat_named_t arcstat_mfu_ghost_evict_metadata;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds;
kstat_named_t arcstat_l2_cksum_bad;
kstat_named_t arcstat_l2_io_error;
kstat_named_t arcstat_l2_size;
+ kstat_named_t arcstat_l2_asize;
kstat_named_t arcstat_l2_hdr_size;
+ kstat_named_t arcstat_l2_compress_successes;
+ kstat_named_t arcstat_l2_compress_zeros;
+ kstat_named_t arcstat_l2_compress_failures;
kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_duplicate_buffers;
+ kstat_named_t arcstat_duplicate_buffers_size;
+ kstat_named_t arcstat_duplicate_reads;
+ kstat_named_t arcstat_memory_direct_count;
+ kstat_named_t arcstat_memory_indirect_count;
+ kstat_named_t arcstat_no_grow;
+ kstat_named_t arcstat_tempreserve;
+ kstat_named_t arcstat_loaned_bytes;
+ kstat_named_t arcstat_prune;
+ kstat_named_t arcstat_meta_used;
+ kstat_named_t arcstat_meta_limit;
+ kstat_named_t arcstat_meta_max;
} arc_stats_t;
static arc_stats_t arc_stats = {
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
+ { "anon_size", KSTAT_DATA_UINT64 },
+ { "anon_evict_data", KSTAT_DATA_UINT64 },
+ { "anon_evict_metadata", KSTAT_DATA_UINT64 },
+ { "mru_size", KSTAT_DATA_UINT64 },
+ { "mru_evict_data", KSTAT_DATA_UINT64 },
+ { "mru_evict_metadata", KSTAT_DATA_UINT64 },
+ { "mru_ghost_size", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evict_data", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evict_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_size", KSTAT_DATA_UINT64 },
+ { "mfu_evict_data", KSTAT_DATA_UINT64 },
+ { "mfu_evict_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_size", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evict_data", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evict_metadata", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "memory_throttle_count", KSTAT_DATA_UINT64 }
+ { "l2_compress_successes", KSTAT_DATA_UINT64 },
+ { "l2_compress_zeros", KSTAT_DATA_UINT64 },
+ { "l2_compress_failures", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "duplicate_buffers", KSTAT_DATA_UINT64 },
+ { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
+ { "duplicate_reads", KSTAT_DATA_UINT64 },
+ { "memory_direct_count", KSTAT_DATA_UINT64 },
+ { "memory_indirect_count", KSTAT_DATA_UINT64 },
+ { "arc_no_grow", KSTAT_DATA_UINT64 },
+ { "arc_tempreserve", KSTAT_DATA_UINT64 },
+ { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_prune", KSTAT_DATA_UINT64 },
+ { "arc_meta_used", KSTAT_DATA_UINT64 },
+ { "arc_meta_limit", KSTAT_DATA_UINT64 },
+ { "arc_meta_max", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val));
+ atomic_add_64(&arc_stats.stat.value.ui64, (val))
#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
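+
+/*
+ * Note that ARCSTAT_INCR() now deliberately omits the trailing semicolon
+ * so the macro expands to a single statement.  With the old trailing
+ * semicolon, an unbraced if/else such as this (illustrative) snippet
+ * would not compile, because the stray empty statement detaches the
+ * else from its if:
+ *
+ *	if (hit)
+ *		ARCSTAT_BUMP(arcstat_hits);
+ *	else
+ *		ARCSTAT_BUMP(arcstat_misses);
+ */
+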
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+#define arc_no_grow ARCSTAT(arcstat_no_grow)
+#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
-static int arc_no_grow; /* Don't try to grow cache size */
-static uint64_t arc_tempreserve;
-static uint64_t arc_loaned_bytes;
-static uint64_t arc_meta_used;
-static uint64_t arc_meta_limit;
-static uint64_t arc_meta_max = 0;
+#define L2ARC_IS_VALID_COMPRESS(_c_) \
+ ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
- void *b_thawed;
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
/* updated atomically */
clock_t b_arc_access;
+ uint32_t b_mru_hits;
+ uint32_t b_mru_ghost_hits;
+ uint32_t b_mfu_hits;
+ uint32_t b_mfu_ghost_hits;
+ uint32_t b_l2_hits;
/* self protecting */
refcount_t b_refcnt;
list_node_t b_l2node;
};
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type);
+static void arc_buf_watch(arc_buf_t *buf);
static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
* Hash table routines
*/
-#define HT_LOCK_PAD 64
+#define HT_LOCK_ALIGN 64
+#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
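+/*
+ * For example, if sizeof (kmutex_t) were 32 bytes, P2NPHASE(32, 64)
+ * evaluates to 32 bytes of pad, rounding each ht_lock up to the 64-byte
+ * HT_LOCK_ALIGN boundary so adjacent hash locks do not share a cache line.
+ */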
struct ht_lock {
kmutex_t ht_lock;
#ifdef _KERNEL
- unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+ unsigned char pad[HT_LOCK_PAD];
#endif
};
*/
#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 2 /* num of writes */
+#define L2ARC_HEADROOM 2 /* num of writes */
+/*
+ * If we discover during ARC scan any buffers to be compressed, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define L2ARC_HEADROOM_BOOST 200
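+/*
+ * E.g. with the default L2ARC_HEADROOM_BOOST of 200 (a percentage), the
+ * boosted headroom works out to roughly (headroom * 200) / 100, i.e.
+ * twice the base scan depth.
+ */
+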
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
-/*
- * L2ARC Performance Tunables
- */
-uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
-uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
-uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
-uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
-uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
-boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
-boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
-boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
+/* L2ARC Performance Tunables */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
+unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
+int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+int l2arc_nocompress = B_FALSE; /* don't compress bufs */
+int l2arc_feed_again = B_TRUE; /* turbo warmup */
+int l2arc_norw = B_FALSE; /* no reads during writes */
/*
* L2ARC Internals
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_write; /* desired write size, bytes */
- uint64_t l2ad_boost; /* warmup write boost, bytes */
uint64_t l2ad_start; /* first addr on device */
uint64_t l2ad_end; /* last addr on device */
uint64_t l2ad_evict; /* last addr eviction reached */
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
- arc_buf_t *l2rcb_buf; /* read buffer */
- spa_t *l2rcb_spa; /* spa */
- blkptr_t l2rcb_bp; /* original blkptr */
- zbookmark_t l2rcb_zb; /* original bookmark */
- int l2rcb_flags; /* original flags */
+ arc_buf_t *l2rcb_buf; /* read buffer */
+ spa_t *l2rcb_spa; /* spa */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+ enum zio_compress l2rcb_compress; /* applied compress */
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+ /* compression applied to buffer data */
+ enum zio_compress b_compress;
+	/* real allocated buffer size, after b_compress is applied */
+ uint32_t b_asize;
+ uint32_t b_hits;
+ /* temporary buffer holder for in-flight compressed data */
+ void *b_tmp_cdata;
};
typedef struct l2arc_data_free {
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);
+static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
+static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
+ enum zio_compress c);
+static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
{
int i;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+ /*
+ * Large allocations which do not require contiguous pages
+	 * should be using vmem_free() in the linux kernel
+ */
+ vmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
kmem_cache_destroy(hdr_cache);
bzero(buf, sizeof (arc_buf_t));
mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
arc_buf_t *buf = vbuf;
mutex_destroy(&buf->b_evict_lock);
- rw_destroy(&buf->b_data_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
- dprintf("hdr_recl called\n");
- /*
- * umem calls the reclaim func when we destroy the buf cache,
- * which is after we do arc_fini().
- */
- if (!arc_dead)
- cv_signal(&arc_reclaim_thr_cv);
-}
-
static void
buf_init(void)
{
hsize <<= 1;
retry:
buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel
+ */
+ buf_hash_table.ht_table =
+ vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
buf_hash_table.ht_table =
kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
if (buf_hash_table.ht_table == NULL) {
ASSERT(hsize > (1ULL << 8));
hsize >>= 1;
}
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
- 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
mutex_exit(&buf->b_hdr->b_freeze_lock);
return;
}
- buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_PUSHPAGE);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum);
mutex_exit(&buf->b_hdr->b_freeze_lock);
+ arc_buf_watch(buf);
+}
+
+#ifndef _KERNEL
+void
+arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
+{
+ panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
+}
+#endif
+
+/* ARGSUSED */
+static void
+arc_buf_unwatch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch) {
+ ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
+ PROT_READ | PROT_WRITE));
+ }
+#endif
+}
+
+/* ARGSUSED */
+static void
+arc_buf_watch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch)
+ ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
+#endif
}
void
buf->b_hdr->b_freeze_cksum = NULL;
}
- if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_thawed)
- kmem_free(buf->b_hdr->b_thawed, 1);
- buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
- }
-
mutex_exit(&buf->b_hdr->b_freeze_lock);
+
+ arc_buf_unwatch(buf);
}
void
buf->b_hdr->b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
mutex_exit(hash_lock);
}
static void
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(list, ab);
if (GHOST_STATE(ab->b_state)) {
- ASSERT3U(ab->b_datacnt, ==, 0);
+ ASSERT0(ab->b_datacnt);
ASSERT3P(ab->b_buf, ==, NULL);
delta = ab->b_size;
}
return (cnt);
}
+/*
+ * Returns detailed information about a specific arc buffer. When the
+ * state_index argument is set, the function will calculate the arc header
+ * list position for its arc state. Since this requires a linear traversal,
+ * callers are strongly encouraged not to do this. However, it can be helpful
+ * for targeted analysis so the functionality is provided.
+ */
+void
+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
+{
+ arc_buf_hdr_t *hdr = ab->b_hdr;
+ arc_state_t *state = hdr->b_state;
+
+ memset(abi, 0, sizeof (arc_buf_info_t));
+ abi->abi_flags = hdr->b_flags;
+ abi->abi_datacnt = hdr->b_datacnt;
+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
+ abi->abi_state_contents = hdr->b_type;
+ abi->abi_state_index = -1;
+ abi->abi_size = hdr->b_size;
+ abi->abi_access = hdr->b_arc_access;
+ abi->abi_mru_hits = hdr->b_mru_hits;
+ abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits;
+ abi->abi_mfu_hits = hdr->b_mfu_hits;
+ abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
+ abi->abi_holds = refcount_count(&hdr->b_refcnt);
+
+ if (hdr->b_l2hdr) {
+ abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr;
+ abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize;
+ abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress;
+ abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits;
+ }
+
+ if (state && state_index && list_link_active(&hdr->b_arc_node)) {
+ list_t *list = &state->arcs_list[hdr->b_type];
+ arc_buf_hdr_t *h;
+
+ mutex_enter(&state->arcs_mtx);
+ for (h = list_head(list); h != NULL; h = list_next(list, h)) {
+ abi->abi_state_index++;
+ if (h == hdr)
+ break;
+ }
+ mutex_exit(&state->arcs_mtx);
+ }
+}
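+
+/*
+ * Illustrative use (sketch only): passing a state_index of 0 skips the
+ * expensive list walk described above, leaving abi_state_index at -1:
+ *
+ *	arc_buf_info_t abi;
+ *
+ *	arc_buf_info(ab, &abi, 0);
+ */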
+
/*
* Move the supplied buffer to the indicated state. The mutex
* for the buffer must be held by the caller.
uint64_t from_delta, to_delta;
ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(new_state != old_state);
+ ASSERT3P(new_state, !=, old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
break;
}
- atomic_add_64(&arc_meta_used, space);
+ ARCSTAT_INCR(arcstat_meta_used, space);
atomic_add_64(&arc_size, space);
}
ASSERT(arc_meta_used >= space);
if (arc_meta_max < arc_meta_used)
arc_meta_max = arc_meta_used;
- atomic_add_64(&arc_meta_used, -space);
+ ARCSTAT_INCR(arcstat_meta_used, -space);
ASSERT(arc_size >= space);
atomic_add_64(&arc_size, -space);
}
-void *
-arc_data_buf_alloc(uint64_t size)
-{
- if (arc_evict_needed(ARC_BUFC_DATA))
- cv_signal(&arc_reclaim_thr_cv);
- atomic_add_64(&arc_size, size);
- return (zio_data_buf_alloc(size));
-}
-
-void
-arc_data_buf_free(void *buf, uint64_t size)
-{
- zio_data_buf_free(buf, size);
- ASSERT(arc_size >= size);
- atomic_add_64(&arc_size, -size);
-}
-
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
- hdr->b_spa = spa_guid(spa);
+ hdr->b_spa = spa_load_guid(spa);
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
+ hdr->b_mru_hits = 0;
+ hdr->b_mru_ghost_hits = 0;
+ hdr->b_mfu_hits = 0;
+ hdr->b_mfu_ghost_hits = 0;
+ hdr->b_l2_hits = 0;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
hdr->b_buf = buf;
arc_get_data_buf(buf);
bcopy(from->b_data, buf->b_data, size);
+
+ /*
+	 * This buffer already exists in the arc, so create a duplicate
+ * copy for the caller. If the buffer is associated with user data
+ * then track the size and number of duplicates. These stats will be
+ * updated as duplicate buffers are created and destroyed.
+ */
+ if (hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMP(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+ }
hdr->b_datacnt += 1;
return (buf);
}
* the buffer is placed on l2arc_free_on_write to be freed later.
*/
static void
-arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
- void *data, size_t size)
+arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
if (HDR_L2_WRITING(hdr)) {
l2arc_data_free_t *df;
- df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
- df->l2df_data = data;
- df->l2df_size = size;
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
+ df->l2df_data = buf->b_data;
+ df->l2df_size = hdr->b_size;
df->l2df_func = free_func;
mutex_enter(&l2arc_free_on_write_mtx);
list_insert_head(l2arc_free_on_write, df);
mutex_exit(&l2arc_free_on_write_mtx);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
- free_func(data, size);
+ free_func(buf->b_data, hdr->b_size);
}
}
arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
- arc_buf_data_free(buf->b_hdr, zio_buf_free,
- buf->b_data, size);
+ arc_buf_data_free(buf, zio_buf_free);
arc_space_return(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
- arc_buf_data_free(buf->b_hdr,
- zio_data_buf_free, buf->b_data, size);
+ arc_buf_data_free(buf, zio_data_buf_free);
ARCSTAT_INCR(arcstat_data_size, -size);
atomic_add_64(&arc_size, -size);
}
ASSERT3U(state->arcs_size, >=, size);
atomic_add_64(&state->arcs_size, -size);
buf->b_data = NULL;
+
+ /*
+ * If we're destroying a duplicate buffer make sure
+ * that the appropriate statistics are updated.
+ */
+ if (buf->b_hdr->b_datacnt > 1 &&
+ buf->b_hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+ }
ASSERT(buf->b_hdr->b_datacnt > 0);
buf->b_hdr->b_datacnt -= 1;
}
if (l2hdr != NULL) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
if (hdr->b_state == arc_l2c_only)
l2arc_hdr_stat_remove();
hdr->b_l2hdr = NULL;
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- if (hdr->b_thawed) {
- kmem_free(hdr->b_thawed, 1);
- hdr->b_thawed = NULL;
- }
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
}
}
-int
+boolean_t
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- int no_callback = (buf->b_efunc == NULL);
+ kmutex_t *hash_lock = NULL;
+ boolean_t no_callback = (buf->b_efunc == NULL);
if (hdr->b_state == arc_anon) {
ASSERT(hdr->b_datacnt == 1);
return (no_callback);
}
+ hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
hdr = buf->b_hdr;
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
return (buf->b_hdr->b_size);
}
+/*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ boolean_t evict_needed = B_FALSE;
+
+ if (zfs_disable_dup_eviction)
+ return (B_FALSE);
+
+ mutex_enter(&buf->b_evict_lock);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts(); let that function
+ * perform the eviction.
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&buf->b_evict_lock);
+ return (B_FALSE);
+ } else if (buf->b_data == NULL) {
+ /*
+ * We have already been added to the arc eviction list;
+ * recommend eviction.
+ */
+ ASSERT3P(hdr, ==, &arc_eviction_hdr);
+ mutex_exit(&buf->b_evict_lock);
+ return (B_TRUE);
+ }
+
+ if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+ evict_needed = B_TRUE;
+
+ mutex_exit(&buf->b_evict_lock);
+ return (evict_needed);
+}
+
/*
* Evict buffers from list until we've removed the specified number of
* bytes. Move the removed buffers to the appropriate evict state.
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
+ arc_buf_hdr_t marker = {{{ 0 }}};
+ int count = 0;
ASSERT(state == arc_mru || state == arc_mfu);
(spa && ab->b_spa != spa) ||
(ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
ddi_get_lbolt() - ab->b_arc_access <
- arc_min_prefetch_lifespan)) {
+ zfs_arc_min_prefetch_lifespan)) {
skipped++;
continue;
}
if (recycle && ab->b_size != bytes &&
ab_prev && ab_prev->b_size == bytes)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ *
+ * If we are looking for a buffer to recycle, we are in
+ * the hot code path, so don't sleep.
+ */
+ if (!recycle && count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&state->arcs_mtx);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(&state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
+
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&ab->b_refcnt));
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
mutex_exit(&state->arcs_mtx);
if (bytes_evicted < bytes)
- dprintf("only evicted %lld bytes from %x",
+ dprintf("only evicted %lld bytes from %x\n",
(longlong_t)bytes_evicted, state);
if (skipped)
ARCSTAT_INCR(arcstat_mutex_miss, missed);
/*
- * We have just evicted some date into the ghost state, make
- * sure we also adjust the ghost state size if necessary.
+ * Note: we have just evicted some data into the ghost state,
+	 * potentially putting the ghost size over the desired size. Rather
+	 * than evicting from the ghost list in this hot code path, leave
+ * this chore to the arc_reclaim_thread().
*/
- if (arc_no_grow &&
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
- int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
- arc_mru_ghost->arcs_size - arc_c;
-
- if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
- int64_t todelete =
- MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
- arc_evict_ghost(arc_mru_ghost, 0, todelete);
- } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
- int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
- arc_mru_ghost->arcs_size +
- arc_mfu_ghost->arcs_size - arc_c);
- arc_evict_ghost(arc_mfu_ghost, 0, todelete);
- }
- }
return (stolen);
}
* bytes. Destroy the buffers that are removed.
*/
static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
{
arc_buf_hdr_t *ab, *ab_prev;
arc_buf_hdr_t marker;
- list_t *list = &state->arcs_list[ARC_BUFC_DATA];
+ list_t *list = &state->arcs_list[type];
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
+ int count = 0;
ASSERT(GHOST_STATE(state));
- bzero(&marker, sizeof(marker));
+ bzero(&marker, sizeof (marker));
top:
mutex_enter(&state->arcs_mtx);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
+		if (ab->b_type >= ARC_BUFC_NUMTYPES)
+ panic("invalid ab=%p", (void *)ab);
if (spa && ab->b_spa != spa)
continue;
/* caller may be trying to modify this buffer, skip it */
if (MUTEX_HELD(hash_lock))
continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ */
+ if (count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&state->arcs_mtx);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(&state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
mutex_enter(&state->arcs_mtx);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
- } else
+ } else {
bufs_skipped += 1;
+ }
}
mutex_exit(&state->arcs_mtx);
}
if (bytes_deleted < bytes)
- dprintf("only deleted %lld bytes from %p",
+ dprintf("only deleted %lld bytes from %p\n",
(longlong_t)bytes_deleted, state);
}
if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
delta = MIN(arc_mru_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mru_ghost, 0, delta);
+ arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA);
}
adjustment =
if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mfu_ghost, 0, delta);
+ arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA);
}
}
+/*
+ * Request that arc user drop references so that N bytes can be released
+ * from the cache. This provides a mechanism to ensure the arc can honor
+ * the arc_meta_limit and reclaim buffers which are pinned in the cache
+ * by higher layers. (i.e. the zpl)
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+ arc_prune_func_t *func;
+ void *private;
+ arc_prune_t *cp, *np;
+
+ mutex_enter(&arc_prune_mtx);
+
+ cp = list_head(&arc_prune_list);
+ while (cp != NULL) {
+ func = cp->p_pfunc;
+ private = cp->p_private;
+ np = list_next(&arc_prune_list, cp);
+ refcount_add(&cp->p_refcnt, func);
+ mutex_exit(&arc_prune_mtx);
+
+ if (func != NULL)
+ func(adjustment, private);
+
+ mutex_enter(&arc_prune_mtx);
+
+ /* User removed prune callback concurrently with execution */
+ if (refcount_remove(&cp->p_refcnt, func) == 0) {
+ ASSERT(!list_link_active(&cp->p_node));
+ refcount_destroy(&cp->p_refcnt);
+ kmem_free(cp, sizeof (*cp));
+ }
+
+ cp = np;
+ }
+
+ ARCSTAT_BUMP(arcstat_prune);
+ mutex_exit(&arc_prune_mtx);
+}
+
static void
arc_do_user_evicts(void)
{
mutex_exit(&arc_eviction_mtx);
}
+/*
+ * Evict only meta data objects from the cache, leaving the data objects.
+ * This is only used to enforce the tunable arc_meta_limit; if we are
+ * unable to evict enough buffers we notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+ int64_t delta;
+
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+ arc_do_user_prune(zfs_arc_meta_prune);
+}
+
/*
* Flush all *evictable* data from the cache for the given spa.
* NOTE: this will not touch "active" (i.e. referenced) data.
uint64_t guid = 0;
if (spa)
- guid = spa_guid(spa);
+ guid = spa_load_guid(spa);
while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
break;
}
- arc_evict_ghost(arc_mru_ghost, guid, -1);
- arc_evict_ghost(arc_mfu_ghost, guid, -1);
+ arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA);
+ arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA);
mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts();
}
void
-arc_shrink(void)
+arc_shrink(uint64_t bytes)
{
if (arc_c > arc_c_min) {
uint64_t to_free;
-#ifdef _KERNEL
- to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
-#else
- to_free = arc_c >> arc_shrink_shift;
-#endif
+ to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
+
if (arc_c > arc_c_min + to_free)
atomic_add_64(&arc_c, -to_free);
else
arc_c = arc_c_min;
- atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
if (arc_c > arc_size)
arc_c = MAX(arc_size, arc_c_min);
if (arc_p > arc_c)
arc_adjust();
}
-static int
-arc_reclaim_needed(void)
-{
-#ifdef _KERNEL
- uint64_t extra;
-
- if (needfree)
- return (1);
-
- /*
- * take 'desfree' extra pages, so we reclaim sooner, rather than later
- */
- extra = desfree;
-
- /*
- * check that we're out of range of the pageout scanner. It starts to
- * schedule paging if freemem is less than lotsfree and needfree.
- * lotsfree is the high-water mark for pageout, and needfree is the
- * number of needed free pages. We add extra pages here to make sure
- * the scanner doesn't start up while we're freeing memory.
- */
- if (freemem < lotsfree + needfree + extra)
- return (1);
-
- /*
- * check to make sure that swapfs has enough space so that anon
- * reservations can still succeed. anon_resvmem() checks that the
- * availrmem is greater than swapfs_minfree, and the number of reserved
- * swap pages. We also add a bit of extra here just to prevent
- * circumstances from getting really dire.
- */
- if (availrmem < swapfs_minfree + swapfs_reserve + extra)
- return (1);
-
-#if defined(__i386)
- /*
- * If we're on an i386 platform, it's possible that we'll exhaust the
- * kernel heap space before we ever run out of available physical
- * memory. Most checks of the size of the heap_area compare against
- * tune.t_minarmem, which is the minimum available real memory that we
- * can have in the system. However, this is generally fixed at 25 pages
- * which is so low that it's useless. In this comparison, we seek to
- * calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the calculation, if less than 1/4th is
- * free)
- */
- if (btop(vmem_size(heap_arena, VMEM_FREE)) <
- (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
- return (1);
-#endif
-
-#else
- if (spa_get_random(100) == 0)
- return (1);
-#endif
- return (0);
-}
-
static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
{
size_t i;
kmem_cache_t *prev_cache = NULL;
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
-#ifdef _KERNEL
- if (arc_meta_used >= arc_meta_limit) {
- /*
- * We are exceeding our meta-data cache limit.
- * Purge some DNLC entries to release holds on meta-data.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
- }
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
-#endif
-
/*
* An aggressive reclamation will shrink the cache size as well as
* reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
- arc_shrink();
+ arc_shrink(bytes);
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
+
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_cache);
}
+/*
+ * Unlike other ZFS implementations, this thread is only responsible for
+ * adapting the target ARC size on Linux. The responsibility for memory
+ * reclamation has been entirely delegated to the arc_shrinker_func()
+ * which is registered with the VM. To reflect this change in behavior,
+ * the arc_reclaim thread has been renamed to arc_adapt.
+ */
static void
-arc_reclaim_thread(void)
+arc_adapt_thread(void)
{
- clock_t growtime = 0;
- arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
callb_cpr_t cpr;
+ int64_t prune;
CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
mutex_enter(&arc_reclaim_thr_lock);
while (arc_thread_exit == 0) {
- if (arc_reclaim_needed()) {
+#ifndef _KERNEL
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+
+ if (spa_get_random(100) == 0) {
if (arc_no_grow) {
if (last_reclaim == ARC_RECLAIM_CONS) {
}
/* reset the growth delay for every reclaim */
- growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+ arc_grow_time = ddi_get_lbolt() +
+ (zfs_arc_grow_retry * hz);
- arc_kmem_reap_now(last_reclaim);
+ arc_kmem_reap_now(last_reclaim, 0);
arc_warm = B_TRUE;
+ }
+#endif /* !_KERNEL */
- } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
+		/* No recent memory pressure; allow the ARC to grow. */
+ if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
arc_no_grow = FALSE;
- }
+
+ /*
+ * Keep meta data usage within limits, arc_shrink() is not
+ * used to avoid collapsing the arc_c value when only the
+ * arc_meta_limit is being exceeded.
+ */
+ prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+ if (prune > 0)
+ arc_adjust_meta(prune, B_TRUE);
arc_adjust();
/* block until needed, or one second, whichever is shorter */
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&arc_reclaim_thr_cv,
+ (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
&arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+
+ /* Allow the module options to be changed */
+ if (zfs_arc_max > 64 << 20 &&
+ zfs_arc_max < physmem * PAGESIZE &&
+ zfs_arc_max != arc_c_max)
+ arc_c_max = zfs_arc_max;
+
+ if (zfs_arc_min > 0 &&
+ zfs_arc_min < arc_c_max &&
+ zfs_arc_min != arc_c_min)
+ arc_c_min = zfs_arc_min;
+
+ if (zfs_arc_meta_limit > 0 &&
+ zfs_arc_meta_limit <= arc_c_max &&
+ zfs_arc_meta_limit != arc_meta_limit)
+ arc_meta_limit = zfs_arc_meta_limit;
}
arc_thread_exit = 0;
thread_exit();
}
+#ifdef _KERNEL
+/*
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is greater than or equal to arc_c_min.
+ * (i.e. amount of dirty data >= arc_c_min)
+ *
+ * This is the easy case; all clean data contained by the mru and mfu
+ * lists is evictable. Evicting all clean data can only drop arc_size
+ * to the amount of dirty data, which is greater than arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is less than arc_c_min.
+ * (i.e. arc_c_min > amount of dirty data)
+ *
+ * 2.1. arc_size is greater than or equal to arc_c_min.
+ * (i.e. arc_size >= arc_c_min > amount of dirty data)
+ *
+ * In this case, not all clean data from the regular mru and mfu
+ * lists is actually evictable; we must leave enough clean data
+ * to keep arc_size above arc_c_min. Thus, the maximum amount of
+ * evictable data from the two lists combined, is exactly the
+ * difference between arc_size and arc_c_min.
+ *
+ * 2.2. arc_size is less than arc_c_min
+ * (i.e. arc_c_min > arc_size > amount of dirty data)
+ *
+ * In this case, none of the data contained in the mru and mfu
+ * lists is evictable, even if it's clean. Since arc_size is
+ * already below arc_c_min, evicting any more would only
+ * increase this negative difference.
+ *		increase this negative difference.
+ */
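+
+/*
+ * Worked example (hypothetical sizes): with arc_c_min = 4GB, arc_size =
+ * 6GB, and 5GB of clean data on the regular mru/mfu lists, arc_dirty is
+ * MAX(6GB - 5GB, 0) = 1GB.  Since 1GB < arc_c_min, case 2 above applies
+ * and we report ghost_clean plus (arc_size - arc_c_min) = 2GB of
+ * evictable regular-list data rather than the full 5GB of clean data.
+ */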
+static uint64_t
+arc_evictable_memory(void)
+{
+ uint64_t arc_clean =
+ arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+ uint64_t ghost_clean =
+ arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
+ uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
+
+ if (arc_dirty >= arc_c_min)
+ return (ghost_clean + arc_clean);
+
+ return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
+}
+
+static int
+__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
+{
+ uint64_t pages;
+
+ /* The arc is considered warm once reclaim has occurred */
+ if (unlikely(arc_warm == B_FALSE))
+ arc_warm = B_TRUE;
+
+ /* Return the potential number of reclaimable pages */
+ pages = btop(arc_evictable_memory());
+ if (sc->nr_to_scan == 0)
+ return (pages);
+
+ /* Not allowed to perform filesystem reclaim */
+ if (!(sc->gfp_mask & __GFP_FS))
+ return (-1);
+
+ /* Reclaim in progress */
+ if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+ return (-1);
+
+ /*
+	 * Evict the requested number of pages by shrinking arc_c by the
+	 * requested amount. If there is nothing left to evict, just
+ * reap whatever we can from the various arc slabs.
+ */
+ if (pages > 0) {
+ arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
+ } else {
+ arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
+ }
+
+ /*
+ * When direct reclaim is observed it usually indicates a rapid
+ * increase in memory pressure. This occurs because the kswapd
+ * threads were unable to asynchronously keep enough free memory
+ * available. In this case set arc_no_grow to briefly pause arc
+ * growth to avoid compounding the memory pressure.
+ */
+ if (current_is_kswapd()) {
+ ARCSTAT_BUMP(arcstat_memory_indirect_count);
+ } else {
+ arc_no_grow = B_TRUE;
+ arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
+ ARCSTAT_BUMP(arcstat_memory_direct_count);
+ }
+
+ mutex_exit(&arc_reclaim_thr_lock);
+
+ return (-1);
+}
+SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
+
+SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
+#endif /* _KERNEL */
+
/*
* Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from. This function is only called
arc_adapt(int bytes, arc_state_t *state)
{
int mult;
- uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
if (state == arc_l2c_only)
return;
}
ASSERT((int64_t)arc_p >= 0);
- if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thr_cv);
- return;
- }
-
if (arc_no_grow)
return;
if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
return (1);
-#ifdef _KERNEL
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then enforce that the size of available vmem for this area remains
- * above about 1/32nd free.
- */
- if (type == ARC_BUFC_DATA && zio_arena != NULL &&
- vmem_size(zio_arena, VMEM_FREE) <
- (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
- return (1);
-#endif
-
- if (arc_reclaim_needed())
+ if (arc_no_grow)
return (1);
return (arc_size > arc_c);
state = (arc_mru->arcs_lsize[type] >= size &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
+
if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA);
+
+ /*
+ * If we are unable to recycle an existing meta buffer
+		 * If we are unable to recycle an existing meta buffer,
+		 * signal the reclaim thread. It will notify users
+		 * via the prune callback to drop references. The
+		 * prune callback is run in the context of the reclaim
+ */
+ cv_signal(&arc_reclaim_thr_cv);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
+
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
ASSERT(list_link_active(&buf->b_arc_node));
} else {
buf->b_flags &= ~ARC_PREFETCH;
+ atomic_inc_32(&buf->b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
}
buf->b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
}
+ atomic_inc_32(&buf->b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
} else if (buf->b_state == arc_mru_ghost) {
arc_state_t *new_state;
buf->b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, buf, hash_lock);
+ atomic_inc_32(&buf->b_mru_ghost_hits);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
} else if (buf->b_state == arc_mfu) {
/*
ASSERT(refcount_count(&buf->b_refcnt) == 0);
ASSERT(list_link_active(&buf->b_arc_node));
}
+ atomic_inc_32(&buf->b_mfu_hits);
ARCSTAT_BUMP(arcstat_mfu_hits);
buf->b_arc_access = ddi_get_lbolt();
} else if (buf->b_state == arc_mfu_ghost) {
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&buf->b_refcnt));
new_state = arc_mru;
}
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(new_state, buf, hash_lock);
+ atomic_inc_32(&buf->b_mfu_ghost_hits);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
} else if (buf->b_state == arc_l2c_only) {
/*
{
if (zio == NULL || zio->io_error == 0)
bcopy(buf->b_data, arg, buf->b_hdr->b_size);
- VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+ VERIFY(arc_buf_remove_ref(buf, arg));
}
/* a generic arc_done_func_t */
{
arc_buf_t **bufp = arg;
if (zio && zio->io_error) {
- VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+ VERIFY(arc_buf_remove_ref(buf, arg));
*bufp = NULL;
} else {
*bufp = buf;
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
- arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
- byteswap_uint64_array :
- dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
- func(buf->b_data, hdr->b_size);
+ dmu_object_byteswap_t bswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+ if (BP_GET_LEVEL(zio->io_bp) > 0)
+ byteswap_uint64_array(buf->b_data, hdr->b_size);
+ else
+			dmu_ot_byteswap[bswap].ob_func(buf->b_data,
+			    hdr->b_size);
}
arc_cksum_compute(buf, B_FALSE);
+ arc_buf_watch(buf);
if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
/*
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
if (acb->acb_done) {
- if (abuf == NULL)
+ if (abuf == NULL) {
+ ARCSTAT_BUMP(arcstat_duplicate_reads);
abuf = arc_buf_clone(buf);
+ }
acb->acb_buf = abuf;
abuf = NULL;
}
}
/*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
* cache. If the block is found in the cache, invoke the provided
* callback immediately and return. Note that the `zio' parameter
* in the callback will be NULL in this case, since no IO was
*
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
- *
- * Normal callers should use arc_read and pass the arc buffer and offset
- * for the bp. But if you know you don't need locking, you can use
- * arc_read_bp.
*/
int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
- arc_done_func_t *done, void *private, int priority, int zio_flags,
- uint32_t *arc_flags, const zbookmark_t *zb)
-{
- int err;
-
- if (pbuf == NULL) {
- /*
- * XXX This happens from traverse callback funcs, for
- * the objset_phys_t block.
- */
- return (arc_read_nolock(pio, spa, bp, done, private, priority,
- zio_flags, arc_flags, zb));
- }
-
- ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
- ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
- rw_enter(&pbuf->b_data_lock, RW_READER);
-
- err = arc_read_nolock(pio, spa, bp, done, private, priority,
- zio_flags, arc_flags, zb);
- rw_exit(&pbuf->b_data_lock);
-
- return (err);
-}
-
-int
-arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- arc_done_func_t *done, void *private, int priority, int zio_flags,
- uint32_t *arc_flags, const zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+ void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
+ const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
zio_t *rzio;
- uint64_t guid = spa_guid(spa);
+ uint64_t guid = spa_load_guid(spa);
+ int rc = 0;
top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
- KM_SLEEP);
+ KM_PUSHPAGE);
acb->acb_done = done;
acb->acb_private = private;
if (pio != NULL)
hdr->b_acb = acb;
add_reference(hdr, hash_lock, private);
mutex_exit(hash_lock);
- return (0);
+ goto out;
}
mutex_exit(hash_lock);
- return (0);
+ goto out;
}
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
arc_access(hdr, hash_lock);
if (*arc_flags & ARC_L2CACHE)
hdr->b_flags |= ARC_L2CACHE;
+ if (*arc_flags & ARC_L2COMPRESS)
+ hdr->b_flags |= ARC_L2COMPRESS;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
vdev_t *vd = NULL;
- daddr_t addr = -1;
+ uint64_t addr = 0;
boolean_t devw = B_FALSE;
if (hdr == NULL) {
}
if (*arc_flags & ARC_L2CACHE)
hdr->b_flags |= ARC_L2CACHE;
+ if (*arc_flags & ARC_L2COMPRESS)
+ hdr->b_flags |= ARC_L2COMPRESS;
if (BP_GET_LEVEL(bp) > 0)
hdr->b_flags |= ARC_INDIRECT;
} else {
/* this block is in the ghost cache */
ASSERT(GHOST_STATE(hdr->b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&hdr->b_refcnt));
ASSERT(hdr->b_buf == NULL);
/* if this is a prefetch, we don't have a reference */
add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_L2CACHE)
hdr->b_flags |= ARC_L2CACHE;
+ if (*arc_flags & ARC_L2COMPRESS)
+ hdr->b_flags |= ARC_L2COMPRESS;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
ASSERT(!GHOST_STATE(hdr->b_state));
- acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
acb->acb_done = done;
acb->acb_private = private;
mutex_exit(hash_lock);
+ /*
+ * At this point, we have a level 1 cache miss. Try again in
+ * L2ARC if possible.
+ */
ASSERT3U(hdr->b_size, ==, size);
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
uint64_t, size, zbookmark_t *, zb);
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_hits);
+ atomic_inc_32(&hdr->b_l2hdr->b_hits);
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
- KM_SLEEP);
+ KM_PUSHPAGE);
cb->l2rcb_buf = buf;
cb->l2rcb_spa = spa;
cb->l2rcb_bp = *bp;
cb->l2rcb_zb = *zb;
cb->l2rcb_flags = zio_flags;
+ cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
+
+ ASSERT(addr >= VDEV_LABEL_START_SIZE &&
+ addr + size < vd->vdev_psize -
+ VDEV_LABEL_END_SIZE);
/*
* l2arc read. The SCL_L2ARC lock will be
* released by l2arc_read_done().
+ * Issue a null zio if the underlying buffer
+ * was squashed to zero size by compression.
*/
- rzio = zio_read_phys(pio, vd, addr, size,
- buf->b_data, ZIO_CHECKSUM_OFF,
- l2arc_read_done, cb, priority, zio_flags |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY, B_FALSE);
+ if (hdr->b_l2hdr->b_compress ==
+ ZIO_COMPRESS_EMPTY) {
+ rzio = zio_null(pio, spa, vd,
+ l2arc_read_done, cb,
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ } else {
+ rzio = zio_read_phys(pio, vd, addr,
+ hdr->b_l2hdr->b_asize,
+ buf->b_data, ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority,
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ }
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
- ARCSTAT_INCR(arcstat_l2_read_bytes, size);
+ ARCSTAT_INCR(arcstat_l2_read_bytes,
+ hdr->b_l2hdr->b_asize);
if (*arc_flags & ARC_NOWAIT) {
zio_nowait(rzio);
- return (0);
+ goto out;
}
ASSERT(*arc_flags & ARC_WAIT);
if (zio_wait(rzio) == 0)
- return (0);
+ goto out;
/* l2arc read error; goto zio_read() */
} else {
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
- if (*arc_flags & ARC_WAIT)
- return (zio_wait(rzio));
+ if (*arc_flags & ARC_WAIT) {
+ rc = zio_wait(rzio);
+ goto out;
+ }
ASSERT(*arc_flags & ARC_NOWAIT);
zio_nowait(rzio);
}
- return (0);
+
+out:
+ spa_read_history_add(spa, zb, *arc_flags);
+ return (rc);
+}
+
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+ arc_prune_t *p;
+
+ p = kmem_alloc(sizeof (*p), KM_SLEEP);
+ p->p_pfunc = func;
+ p->p_private = private;
+ list_link_init(&p->p_node);
+ refcount_create(&p->p_refcnt);
+
+ mutex_enter(&arc_prune_mtx);
+ refcount_add(&p->p_refcnt, &arc_prune_list);
+ list_insert_head(&arc_prune_list, p);
+ mutex_exit(&arc_prune_mtx);
+
+ return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+ mutex_enter(&arc_prune_mtx);
+ list_remove(&arc_prune_list, p);
+ if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
}
void
buf->b_private = private;
}
+/*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ uint64_t guid = spa_load_guid(spa);
+
+ hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+ &hash_lock);
+ if (hdr == NULL)
+ return;
+ if (HDR_BUF_AVAILABLE(hdr)) {
+ arc_buf_t *buf = hdr->b_buf;
+ add_reference(hdr, hash_lock, FTAG);
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ mutex_exit(hash_lock);
+
+ arc_release(buf, FTAG);
+ (void) arc_buf_remove_ref(buf, FTAG);
+ } else {
+ mutex_exit(hash_lock);
+ }
+}
+
/*
* This is used by the DMU to let the ARC know that a buffer is
* being evicted, so the ARC should clean up. If this arc buf
}
/*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
* a new hdr for the buffer.
*/
if (l2hdr) {
mutex_enter(&l2arc_buflist_mtx);
hdr->b_l2hdr = NULL;
- buf_size = hdr->b_size;
}
+ buf_size = hdr->b_size;
/*
* Do we have more than one buf?
ASSERT3U(*size, >=, hdr->b_size);
atomic_add_64(size, -hdr->b_size);
}
+
+ /*
+	 * We're releasing a duplicate user data buffer, so update
+ * our statistics accordingly.
+ */
+ if (hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+ -hdr->b_size);
+ }
hdr->b_datacnt -= 1;
arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
mutex_exit(hash_lock);
nhdr->b_buf = buf;
nhdr->b_state = arc_anon;
nhdr->b_arc_access = 0;
+ nhdr->b_mru_hits = 0;
+ nhdr->b_mru_ghost_hits = 0;
+ nhdr->b_mfu_hits = 0;
+ nhdr->b_mfu_ghost_hits = 0;
+ nhdr->b_l2_hits = 0;
nhdr->b_flags = flags & ARC_L2_WRITING;
nhdr->b_l2hdr = NULL;
nhdr->b_datacnt = 1;
if (hdr->b_state != arc_anon)
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
+ hdr->b_mru_hits = 0;
+ hdr->b_mru_ghost_hits = 0;
+ hdr->b_mfu_hits = 0;
+ hdr->b_mfu_ghost_hits = 0;
+ hdr->b_l2_hits = 0;
if (hash_lock)
mutex_exit(hash_lock);
buf->b_private = NULL;
if (l2hdr) {
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
ARCSTAT_INCR(arcstat_l2_size, -buf_size);
mutex_exit(&l2arc_buflist_mtx);
}
}
-/*
- * Release this buffer. If it does not match the provided BP, fill it
- * with that block's contents.
- */
-/* ARGSUSED */
-int
-arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
- zbookmark_t *zb)
-{
- arc_release(buf, tag);
- return (0);
-}
-
int
arc_released(arc_buf_t *buf)
{
hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
static void
arc_write_done(zio_t *zio)
{
arc_hdr_destroy(exists);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
+ } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+ /* nopwrite */
+ ASSERT(zio->io_prop.zp_nopwrite);
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad nopwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
} else {
/* Dedup */
ASSERT(hdr->b_datacnt == 1);
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
- blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
- arc_done_func_t *ready, arc_done_func_t *done, void *private,
- int priority, int zio_flags, const zbookmark_t *zb)
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+ arc_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
ASSERT(hdr->b_acb == NULL);
if (l2arc)
hdr->b_flags |= ARC_L2CACHE;
- callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ if (l2arc_compress)
+ hdr->b_flags |= ARC_L2COMPRESS;
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
callback->awcb_ready = ready;
+ callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
- arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+ arc_write_ready, arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
return (zio);
}
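+
+/*
+ * Illustrative call site for the updated signature (paraphrased from
+ * the dbuf.c caller; the dbuf_write_* callbacks and DBUF_IS_* macros
+ * are assumptions about that caller, not part of this change).
+ * Callers that do not need per-physical-write notification may pass
+ * NULL for the physdone callback:
+ *
+ *	zio = arc_write(pio, os->os_spa, txg, db->db_blkptr, data,
+ *	    DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp,
+ *	    dbuf_write_ready, dbuf_write_physdone, dbuf_write_done, db,
+ *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ */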
static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t available_memory = ptob(freemem);
- static uint64_t page_load = 0;
- static uint64_t last_txg = 0;
-
-#if defined(__i386)
- available_memory =
- MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
-#endif
- if (available_memory >= zfs_write_limit_max)
- return (0);
-
- if (txg > last_txg) {
- last_txg = txg;
- page_load = 0;
- }
- /*
- * If we are in pageout, we know that memory is already tight,
- * the arc is already going to be evicting, so we just want to
- * continue to let page writes occur as quickly as possible.
- */
- if (curproc == proc_pageout) {
- if (page_load > MAX(ptob(minfree), available_memory) / 4)
- return (ERESTART);
- /* Note: reserve is inflated, so we deflate */
- page_load += reserve / 8;
+ if (zfs_arc_memory_throttle_disable)
return (0);
- } else if (page_load > 0 && arc_reclaim_needed()) {
- /* memory is low, delay before restarting */
- ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- return (EAGAIN);
- }
- page_load = 0;
-
- if (arc_size > arc_c_min) {
- uint64_t evictable_memory =
- arc_mru->arcs_lsize[ARC_BUFC_DATA] +
- arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
- available_memory += MIN(evictable_memory, arc_size - arc_c_min);
- }
- if (inflight_data > available_memory / 4) {
+ if (freemem <= physmem * arc_lotsfree_percent / 100) {
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- return (ERESTART);
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
+ return (SET_ERROR(EAGAIN));
}
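+	/*
+	 * Worked example (assuming the default arc_lotsfree_percent of
+	 * 10): on a system with 16 GiB of physical memory, the check
+	 * above throttles with EAGAIN whenever less than ~1.6 GiB
+	 * remains free.
+	 */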
#endif
return (0);
int error;
uint64_t anon_size;
-#ifdef ZFS_DEBUG
- /*
- * Once in a while, fail for no reason. Everything should cope.
- */
- if (spa_get_random(10000) == 0) {
- dprintf("forcing random failure\n");
- return (ERESTART);
- }
-#endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
- if (reserve > arc_c)
- return (ENOMEM);
+ if (reserve > arc_c) {
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
+ return (SET_ERROR(ENOMEM));
+ }
/*
* Don't count loaned bufs as in flight dirty data to prevent long
/*
* Writes will, almost always, require additional memory allocations
- * in order to compress/encrypt/etc the data. We therefor need to
+ * in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- if ((error = arc_memory_throttle(reserve, anon_size, txg)))
+ error = arc_memory_throttle(reserve, txg);
+ if (error != 0)
return (error);
/*
arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
reserve>>10, arc_c>>10);
- return (ERESTART);
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
+ return (SET_ERROR(ERESTART));
}
atomic_add_64(&arc_tempreserve, reserve);
return (0);
}
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+ kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+ size->value.ui64 = state->arcs_size;
+ evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
+ evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
+}
+
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+ arc_stats_t *as = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (SET_ERROR(EACCES));
+ } else {
+ arc_kstat_update_state(arc_anon,
+ &as->arcstat_anon_size,
+ &as->arcstat_anon_evict_data,
+ &as->arcstat_anon_evict_metadata);
+ arc_kstat_update_state(arc_mru,
+ &as->arcstat_mru_size,
+ &as->arcstat_mru_evict_data,
+ &as->arcstat_mru_evict_metadata);
+ arc_kstat_update_state(arc_mru_ghost,
+ &as->arcstat_mru_ghost_size,
+ &as->arcstat_mru_ghost_evict_data,
+ &as->arcstat_mru_ghost_evict_metadata);
+ arc_kstat_update_state(arc_mfu,
+ &as->arcstat_mfu_size,
+ &as->arcstat_mfu_evict_data,
+ &as->arcstat_mfu_evict_metadata);
+ arc_kstat_update_state(arc_mfu_ghost,
+ &as->arcstat_mfu_ghost_size,
+ &as->arcstat_mfu_ghost_evict_data,
+ &as->arcstat_mfu_ghost_evict_metadata);
+ }
+
+ return (0);
+}
+
void
arc_init(void)
{
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
- arc_min_prefetch_lifespan = 1 * hz;
+ zfs_arc_min_prefetch_lifespan = 1 * hz;
/* Start out with 1/8 of all memory */
arc_c = physmem * PAGESIZE / 8;
* need to limit the cache to 1/8 of VM size.
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+ /*
+ * Register a shrinker to support synchronous (direct) memory
+ * reclaim from the arc. This is done to prevent kswapd from
+ * swapping out pages when it is preferable to shrink the arc.
+ */
+ spl_register_shrinker(&arc_shrinker);
#endif
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(arc_c / 4, 64<<20);
- /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
- if (arc_c * 8 >= 1<<30)
- arc_c_max = (arc_c * 8) - (1<<30);
- else
- arc_c_max = arc_c_min;
- arc_c_max = MAX(arc_c * 6, arc_c_max);
+ /* set max to 1/2 of all memory */
+ arc_c_max = arc_c * 4;
/*
* Allow the tunables to override our calculations if they are
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
+ arc_meta_max = 0;
/* Allow the tunable to override if it is reasonable */
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
arc_c_min = arc_meta_limit / 2;
- if (zfs_arc_grow_retry > 0)
- arc_grow_retry = zfs_arc_grow_retry;
-
- if (zfs_arc_shrink_shift > 0)
- arc_shrink_shift = zfs_arc_shrink_shift;
-
- if (zfs_arc_p_min_shift > 0)
- arc_p_min_shift = zfs_arc_p_min_shift;
-
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ arc_anon->arcs_state = ARC_STATE_ANON;
+ arc_mru->arcs_state = ARC_STATE_MRU;
+ arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
+ arc_mfu->arcs_state = ARC_STATE_MFU;
+ arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
+ arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
+
buf_init();
arc_thread_exit = 0;
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
arc_eviction_list = NULL;
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
if (arc_ksp != NULL) {
arc_ksp->ks_data = &arc_stats;
+ arc_ksp->ks_update = arc_kstat_update;
kstat_install(arc_ksp);
}
- (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
arc_dead = FALSE;
arc_warm = B_FALSE;
- if (zfs_write_limit_max == 0)
- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
- else
- zfs_write_limit_shift = 0;
- mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+ /*
+	 * Calculate the maximum amount of dirty data per pool.
+ *
+ * If it has been set by a module parameter, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 25% of physical memory).
+ */
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = physmem * PAGESIZE *
+ zfs_dirty_data_max_max_percent / 100;
+
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = physmem * PAGESIZE *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
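+	/*
+	 * Worked example (illustrative numbers): with 8 GiB of physical
+	 * memory and the default percentages above (10% and 25%),
+	 * zfs_dirty_data_max_max is 2 GiB and zfs_dirty_data_max comes
+	 * out to ~819 MiB, which is under the cap, so the MIN() above
+	 * leaves it unchanged.
+	 */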
}
void
arc_fini(void)
{
+ arc_prune_t *p;
+
mutex_enter(&arc_reclaim_thr_lock);
+#ifdef _KERNEL
+ spl_unregister_shrinker(&arc_shrinker);
+#endif /* _KERNEL */
+
arc_thread_exit = 1;
while (arc_thread_exit != 0)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
arc_ksp = NULL;
}
+ mutex_enter(&arc_prune_mtx);
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
+ refcount_remove(&p->p_refcnt, &arc_prune_list);
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+
+ list_destroy(&arc_prune_list);
+ mutex_destroy(&arc_prune_mtx);
mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
mutex_destroy(&arc_l2c_only->arcs_mtx);
- mutex_destroy(&zfs_write_limit_lock);
-
buf_fini();
ASSERT(arc_loaned_bytes == 0);
* 2. The L2ARC attempts to cache data from the ARC before it is evicted.
* It does this by periodically scanning buffers from the eviction-end of
* the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
- * not already there. It scans until a headroom of buffers is satisfied,
- * which itself is a buffer for ARC eviction. The thread that does this is
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
* l2arc_feed_thread(), illustrated below; example sizes are included to
* provide a better sense of ratio than this diagram:
*
* l2arc_write_max max write bytes per interval
* l2arc_write_boost extra write bytes during device warmup
* l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_nocompress skip compressing buffers
* l2arc_headroom number of max device writes to precache
+ * l2arc_headroom_boost when we find compressed buffers during ARC
+ * scanning, we multiply headroom by this
+ * percentage factor for the next scan cycle,
+ * since more compressed buffers are likely to
+ * be present
* l2arc_feed_secs seconds between L2ARC writing
*
* Tunables may be removed or added as future performance improvements are
}
static uint64_t
-l2arc_write_size(l2arc_dev_t *dev)
+l2arc_write_size(void)
{
uint64_t size;
- size = dev->l2ad_write;
+ /*
+ * Make sure our globals have meaningful values in case the user
+ * altered them.
+ */
+ size = l2arc_write_max;
+ if (size == 0) {
+ cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+ "be greater than zero, resetting it to the default (%d)",
+ L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = L2ARC_WRITE_SIZE;
+ }
if (arc_warm == B_FALSE)
- size += dev->l2ad_boost;
+ size += l2arc_write_boost;
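+	/*
+	 * Example (assuming the usual default of 8 MiB for both
+	 * l2arc_write_max and l2arc_write_boost): this permits up to
+	 * 16 MiB per feed interval until the ARC is warm, and 8 MiB
+	 * thereafter.
+	 */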
return (size);
static void
l2arc_hdr_stat_add(void)
{
- ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
}
static void
l2arc_hdr_stat_remove(void)
{
- ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+ ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
}
*/
for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
ab_prev = list_prev(buflist, ab);
+ abl2 = ab->b_l2hdr;
+
+ /*
+ * Release the temporary compressed buffer as soon as possible.
+ */
+ if (abl2->b_compress != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(ab);
hash_lock = HDR_LOCK(ab);
if (!mutex_tryenter(hash_lock)) {
* Error - drop L2ARC entry.
*/
list_remove(buflist, ab);
- abl2 = ab->b_l2hdr;
+ ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
ab->b_l2hdr = NULL;
kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
hdr = buf->b_hdr;
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ /*
+ * If the buffer was compressed, decompress it first.
+ */
+ if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
+ l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
+ ASSERT(zio->io_data != NULL);
+
/*
* Check this survived the L2ARC journey.
*/
if (zio->io_error != 0) {
ARCSTAT_BUMP(arcstat_l2_io_error);
} else {
- zio->io_error = EIO;
+ zio->io_error = SET_ERROR(EIO);
}
if (!equal)
ARCSTAT_BUMP(arcstat_l2_cksum_bad);
*/
if (ab->b_l2hdr != NULL) {
abl2 = ab->b_l2hdr;
+ ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
ab->b_l2hdr = NULL;
kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
list_remove(buflist, ab);
*
* An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
* for reading until they have completed writing.
+ * headroom_boost is an in-out parameter used to maintain headroom boost
+ * state between calls to this function.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has moved, due to alignment).
*/
static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
+ boolean_t *headroom_boost)
{
arc_buf_hdr_t *ab, *ab_prev, *head;
- l2arc_buf_hdr_t *hdrl2;
list_t *list;
- uint64_t passed_sz, write_sz, buf_sz, headroom;
+ uint64_t write_asize, write_psize, write_sz, headroom,
+ buf_compress_minsz;
void *buf_data;
- kmutex_t *hash_lock, *list_lock = NULL;
- boolean_t have_lock, full;
+ kmutex_t *list_lock = NULL;
+ boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
- uint64_t guid = spa_guid(spa);
+ uint64_t guid = spa_load_guid(spa);
int try;
+ const boolean_t do_headroom_boost = *headroom_boost;
ASSERT(dev->l2ad_vdev != NULL);
+ /* Lower the flag now, we might want to raise it again later. */
+ *headroom_boost = B_FALSE;
+
pio = NULL;
- write_sz = 0;
+ write_sz = write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
+ /*
+ * We will want to try to compress buffers that are at least 2x the
+ * device sector size.
+ */
+ buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
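+	/* e.g. ashift = 9 gives a 1 KiB minimum; ashift = 12 gives 8 KiB */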
+
/*
* Copy buffers for L2ARC writing.
*/
mutex_enter(&l2arc_buflist_mtx);
for (try = 0; try <= 3; try++) {
+ uint64_t passed_sz = 0;
+
list = l2arc_list_locked(try, &list_lock);
- passed_sz = 0;
/*
* L2ARC fast warmup.
* Until the ARC is warm and starts to evict, read from the
* head of the ARC lists rather than the tail.
*/
- headroom = target_sz * l2arc_headroom;
if (arc_warm == B_FALSE)
ab = list_head(list);
else
ab = list_tail(list);
+ headroom = target_sz * l2arc_headroom;
+ if (do_headroom_boost)
+ headroom = (headroom * l2arc_headroom_boost) / 100;
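+			/*
+			 * Worked example (assuming the defaults of
+			 * l2arc_headroom = 2 and l2arc_headroom_boost =
+			 * 200): an 8 MiB target write size yields a
+			 * 16 MiB scan headroom, doubled to 32 MiB when
+			 * compressed buffers were found on the previous
+			 * cycle.
+			 */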
+
for (; ab; ab = ab_prev) {
+ l2arc_buf_hdr_t *l2hdr;
+ kmutex_t *hash_lock;
+ uint64_t buf_sz;
+
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
hash_lock = HDR_LOCK(ab);
- have_lock = MUTEX_HELD(hash_lock);
- if (!have_lock && !mutex_tryenter(hash_lock)) {
+ if (!mutex_tryenter(hash_lock)) {
/*
* Skip this buffer rather than waiting.
*/
*/
list_insert_head(dev->l2ad_buflist, head);
- cb = kmem_alloc(
- sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb = kmem_alloc(sizeof (l2arc_write_callback_t),
+ KM_PUSHPAGE);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
/*
* Create and add a new L2ARC header.
*/
- hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
- hdrl2->b_dev = dev;
- hdrl2->b_daddr = dev->l2ad_hand;
+ l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
+ KM_PUSHPAGE);
+ l2hdr->b_dev = dev;
+ arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
ab->b_flags |= ARC_L2_WRITING;
- ab->b_l2hdr = hdrl2;
- list_insert_head(dev->l2ad_buflist, ab);
- buf_data = ab->b_buf->b_data;
+
+ /*
+ * Temporarily stash the data buffer in b_tmp_cdata.
+ * The subsequent write step will pick it up from
+			 * there. This is because we can't access ab->b_buf
+			 * without holding the hash_lock, which we in turn
+			 * can't access without holding the ARC list locks
+			 * (which we want to avoid during compression/writing).
+ */
+ l2hdr->b_compress = ZIO_COMPRESS_OFF;
+ l2hdr->b_asize = ab->b_size;
+ l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+ l2hdr->b_hits = 0;
+
buf_sz = ab->b_size;
+ ab->b_l2hdr = l2hdr;
+
+ list_insert_head(dev->l2ad_buflist, ab);
/*
* Compute and store the buffer cksum before
mutex_exit(hash_lock);
+ write_sz += buf_sz;
+ }
+
+ mutex_exit(list_lock);
+
+ if (full == B_TRUE)
+ break;
+ }
+
+ /* No buffers selected for writing? */
+ if (pio == NULL) {
+ ASSERT0(write_sz);
+ mutex_exit(&l2arc_buflist_mtx);
+ kmem_cache_free(hdr_cache, head);
+ return (0);
+ }
+
+ /*
+	 * Now start writing the buffers. We start at the write head and
+	 * work backwards, retracing the course of the buffer selector
+ * loop above.
+ */
+ for (ab = list_prev(dev->l2ad_buflist, head); ab;
+ ab = list_prev(dev->l2ad_buflist, ab)) {
+ l2arc_buf_hdr_t *l2hdr;
+ uint64_t buf_sz;
+
+ /*
+ * We shouldn't need to lock the buffer here, since we flagged
+ * it as ARC_L2_WRITING in the previous step, but we must take
+ * care to only access its L2 cache parameters. In particular,
+ * ab->b_buf may be invalid by now due to ARC eviction.
+ */
+ l2hdr = ab->b_l2hdr;
+ l2hdr->b_daddr = dev->l2ad_hand;
+
+ if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
+ l2hdr->b_asize >= buf_compress_minsz) {
+ if (l2arc_compress_buf(l2hdr)) {
+ /*
+ * If compression succeeded, enable headroom
+ * boost on the next scan cycle.
+ */
+ *headroom_boost = B_TRUE;
+ }
+ }
+
+ /*
+ * Pick up the buffer data we had previously stashed away
+ * (and now potentially also compressed).
+ */
+ buf_data = l2hdr->b_tmp_cdata;
+ buf_sz = l2hdr->b_asize;
+
+ /* Compression may have squashed the buffer to zero length. */
+ if (buf_sz != 0) {
+ uint64_t buf_p_sz;
+
wzio = zio_write_phys(pio, dev->l2ad_vdev,
dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
zio_t *, wzio);
(void) zio_nowait(wzio);
+ write_asize += buf_sz;
/*
* Keep the clock hand suitably device-aligned.
*/
- buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-
- write_sz += buf_sz;
- dev->l2ad_hand += buf_sz;
+ buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+ write_psize += buf_p_sz;
+ dev->l2ad_hand += buf_p_sz;
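+			/*
+			 * Alignment example (hypothetical numbers): on a
+			 * vdev with 4 KiB sectors (ashift = 12), a buffer
+			 * compressed to 6000 bytes still advances the hand
+			 * by 8192 bytes, which is why write_psize can
+			 * exceed write_asize.
+			 */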
}
-
- mutex_exit(list_lock);
-
- if (full == B_TRUE)
- break;
}
- mutex_exit(&l2arc_buflist_mtx);
- if (pio == NULL) {
- ASSERT3U(write_sz, ==, 0);
- kmem_cache_free(hdr_cache, head);
- return (0);
- }
+ mutex_exit(&l2arc_buflist_mtx);
- ASSERT3U(write_sz, <=, target_sz);
+ ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
- ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
+ ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
- vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_asize, write_asize);
+ vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
(void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
- return (write_sz);
+ return (write_asize);
+}
+
+/*
+ * Compresses an L2ARC buffer.
+ * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
+ * size in l2hdr->b_asize. This routine tries to compress the data; depending
+ * on the compression result, there are three possible outcomes:
+ * *) The buffer was incompressible. The original l2hdr contents were left
+ * untouched and are ready for writing to an L2 device.
+ * *) The buffer was all-zeros, so there is no need to write it to an L2
+ * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
+ * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
+ * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
+ * data buffer which holds the compressed data to be written, and b_asize
+ * tells us how much data there is. b_compress is set to the appropriate
+ * compression algorithm. Once writing is done, invoke
+ * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
+ *
+ * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
+ * buffer was incompressible).
+ */
+static boolean_t
+l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+{
+ void *cdata;
+ size_t csize, len;
+
+ ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
+ ASSERT(l2hdr->b_tmp_cdata != NULL);
+
+ len = l2hdr->b_asize;
+ cdata = zio_data_buf_alloc(len);
+ csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
+ cdata, l2hdr->b_asize);
+
+ if (csize == 0) {
+ /* zero block, indicate that there's nothing to write */
+ zio_data_buf_free(cdata, len);
+ l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
+ l2hdr->b_asize = 0;
+ l2hdr->b_tmp_cdata = NULL;
+ ARCSTAT_BUMP(arcstat_l2_compress_zeros);
+ return (B_TRUE);
+ } else if (csize > 0 && csize < len) {
+ /*
+ * Compression succeeded, we'll keep the cdata around for
+ * writing and release it afterwards.
+ */
+ l2hdr->b_compress = ZIO_COMPRESS_LZ4;
+ l2hdr->b_asize = csize;
+ l2hdr->b_tmp_cdata = cdata;
+ ARCSTAT_BUMP(arcstat_l2_compress_successes);
+ return (B_TRUE);
+ } else {
+ /*
+ * Compression failed, release the compressed buffer.
+ * l2hdr will be left unmodified.
+ */
+ zio_data_buf_free(cdata, len);
+ ARCSTAT_BUMP(arcstat_l2_compress_failures);
+ return (B_FALSE);
+ }
+}
+
+/*
+ * Decompresses a zio read back from an l2arc device. On success, the
+ * underlying zio's io_data buffer is overwritten by the uncompressed
+ * version. On decompression error (corrupt compressed stream), the
+ * zio->io_error value is set to signal an I/O error.
+ *
+ * Please note that the compressed data stream is not checksummed, so if the
+ * underlying device is experiencing data corruption, we may feed corrupt
+ * data to the decompressor; the decompressor must therefore be able to
+ * handle this situation (LZ4 can).
+ */
+static void
+l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
+{
+ uint64_t csize;
+ void *cdata;
+
+ ASSERT(L2ARC_IS_VALID_COMPRESS(c));
+
+ if (zio->io_error != 0) {
+ /*
+		 * An I/O error has occurred; just restore the original io
+ * size in preparation for a main pool read.
+ */
+ zio->io_orig_size = zio->io_size = hdr->b_size;
+ return;
+ }
+
+ if (c == ZIO_COMPRESS_EMPTY) {
+ /*
+ * An empty buffer results in a null zio, which means we
+ * need to fill its io_data after we're done restoring the
+ * buffer's contents.
+ */
+ ASSERT(hdr->b_buf != NULL);
+ bzero(hdr->b_buf->b_data, hdr->b_size);
+ zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
+ } else {
+ ASSERT(zio->io_data != NULL);
+ /*
+ * We copy the compressed data from the start of the arc buffer
+ * (the zio_read will have pulled in only what we need, the
+ * rest is garbage which we will overwrite at decompression)
+ * and then decompress back to the ARC data buffer. This way we
+ * can minimize copying by simply decompressing back over the
+ * original compressed data (rather than decompressing to an
+ * aux buffer and then copying back the uncompressed buffer,
+ * which is likely to be much larger).
+ */
+ csize = zio->io_size;
+ cdata = zio_data_buf_alloc(csize);
+ bcopy(zio->io_data, cdata, csize);
+ if (zio_decompress_data(c, cdata, zio->io_data, csize,
+ hdr->b_size) != 0)
+ zio->io_error = SET_ERROR(EIO);
+ zio_data_buf_free(cdata, csize);
+ }
+
+ /* Restore the expected uncompressed IO size. */
+ zio->io_orig_size = zio->io_size = hdr->b_size;
+}
+
+/*
+ * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
+ * This buffer serves as a temporary holder of compressed data while
+ * the buffer entry is being written to an l2arc device. Once that is
+ * done, we can dispose of it.
+ */
+static void
+l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
+{
+ l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+
+ if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
+ /*
+ * If the data was compressed, then we've allocated a
+ * temporary buffer for it, so now we need to release it.
+ */
+ ASSERT(l2hdr->b_tmp_cdata != NULL);
+ zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
+ }
+ l2hdr->b_tmp_cdata = NULL;
}
/*
spa_t *spa;
uint64_t size, wrote;
clock_t begin, next = ddi_get_lbolt();
+ boolean_t headroom_boost = B_FALSE;
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
while (l2arc_thread_exit == 0) {
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- next);
+ (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
+ &l2arc_feed_thr_lock, next);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
next = ddi_get_lbolt() + hz;
/*
* Avoid contributing to memory pressure.
*/
- if (arc_reclaim_needed()) {
+ if (arc_no_grow) {
ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
spa_config_exit(spa, SCL_L2ARC, dev);
continue;
ARCSTAT_BUMP(arcstat_l2_feeds);
- size = l2arc_write_size(dev);
+ size = l2arc_write_size();
/*
* Evict L2ARC buffers that will be overwritten.
/*
* Write ARC buffers.
*/
- wrote = l2arc_write_buffers(spa, dev, size);
+ wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
/*
* Calculate interval between writes.
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
- adddev->l2ad_write = l2arc_write_max;
- adddev->l2ad_boost = l2arc_write_boost;
adddev->l2ad_start = VDEV_LABEL_START_SIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
list_link_init(&adddev->l2ad_node);
- ASSERT3U(adddev->l2ad_write, >, 0);
/*
* This is a list of all ARC buffers that are still valid on the
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_buf_info);
+EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
+
+module_param(zfs_arc_min, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
+
+module_param(zfs_arc_max, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+
+module_param(zfs_arc_meta_prune, int, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
+
+module_param(zfs_arc_grow_retry, int, 0644);
+MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
+
+module_param(zfs_arc_shrink_shift, int, 0644);
+MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
+
+module_param(zfs_arc_p_min_shift, int, 0644);
+MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
+
+module_param(zfs_disable_dup_eviction, int, 0644);
+MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
+
+module_param(zfs_arc_memory_throttle_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
+
+module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+
+module_param(l2arc_write_max, ulong, 0644);
+MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
+
+module_param(l2arc_write_boost, ulong, 0644);
+MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
+
+module_param(l2arc_headroom, ulong, 0644);
+MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
+
+module_param(l2arc_headroom_boost, ulong, 0644);
+MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
+
+module_param(l2arc_feed_secs, ulong, 0644);
+MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
+
+module_param(l2arc_feed_min_ms, ulong, 0644);
+MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
+
+module_param(l2arc_noprefetch, int, 0644);
+MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
+
+module_param(l2arc_nocompress, int, 0644);
+MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
+
+module_param(l2arc_feed_again, int, 0644);
+MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
+
+module_param(l2arc_norw, int, 0644);
+MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
+
+#endif