*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
/*
* tight.
*
* 3. The Megiddo and Modha model assumes a fixed page size. All
- * elements of the cache are therefor exactly the same size. So
+ * elements of the cache are therefore exactly the same size. So
* when adjusting the cache size following a cache miss, its simply
* a matter of choosing a single page to evict. In our model, we
* have variable sized cache blocks (rangeing from 512 bytes to
- * 128K bytes). We therefor choose a set of blocks to evict to make
+ * 128K bytes). We therefore choose a set of blocks to evict to make
* space for a cache miss that approximates as closely as possible
* the space used by the new block.
*
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
* uses method 1, while the internal arc algorithms for
- * adjusting the cache use method 2. We therefor provide two
+ * adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
#include <sys/spa.h>
#include <sys/zio.h>
+#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/dmu_tx.h>
#include <zfs_fletcher.h>
+#ifndef _KERNEL
+/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
+boolean_t arc_watch = B_FALSE;
+#endif
+
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
/* number of bytes to prune from caches when at arc_meta_limit is reached */
-uint_t arc_meta_prune = 1048576;
+int zfs_arc_meta_prune = 1048576;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
ARC_RECLAIM_CONS /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
-/* number of seconds before growing cache again */
-static int arc_grow_retry = 5;
+/*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+int arc_evict_iterations = 100;
-/* expiration time for arc_no_grow */
-static clock_t arc_grow_time = 0;
+/* number of seconds before growing cache again */
+int zfs_arc_grow_retry = 5;
/* shift of arc_c for calculating both min and max arc_p */
-static int arc_p_min_shift = 4;
+int zfs_arc_p_min_shift = 4;
/* log2(fraction of arc to reclaim) */
-static int arc_shrink_shift = 5;
+int zfs_arc_shrink_shift = 5;
/*
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
-static int arc_min_prefetch_lifespan;
+int zfs_arc_min_prefetch_lifespan = HZ;
+
+/* disable arc proactive arc throttle due to low memory */
+int zfs_arc_memory_throttle_disable = 1;
+
+/* disable duplicate buffer eviction */
+int zfs_disable_dup_eviction = 0;
+
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
static int arc_dead;
+/* expiration time for arc_no_grow */
+static clock_t arc_grow_time = 0;
+
/*
* The arc has filled available memory and has now warmed up.
*/
unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0;
-int zfs_arc_grow_retry = 0;
-int zfs_arc_shrink_shift = 0;
-int zfs_arc_p_min_shift = 0;
-int zfs_arc_meta_prune = 0;
/*
* Note that buffers can be in one of 6 states:
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
+ arc_state_type_t arcs_state;
} arc_state_t;
/* The 6 states: */
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_deleted;
kstat_named_t arcstat_recycle_miss;
+ /*
+ * Number of buffers that could not be evicted because the hash lock
+ * was held by another thread. The lock may not necessarily be held
+ * by something using the same buffer, since hash locks are shared
+ * by multiple buffers.
+ */
kstat_named_t arcstat_mutex_miss;
+ /*
+ * Number of buffers skipped because they have I/O in progress, are
+ * indrect prefetch buffers that have not lived long enough, or are
+ * not from the spa we're trying to evict from.
+ */
kstat_named_t arcstat_evict_skip;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
kstat_named_t arcstat_l2_cksum_bad;
kstat_named_t arcstat_l2_io_error;
kstat_named_t arcstat_l2_size;
+ kstat_named_t arcstat_l2_asize;
kstat_named_t arcstat_l2_hdr_size;
+ kstat_named_t arcstat_l2_compress_successes;
+ kstat_named_t arcstat_l2_compress_zeros;
+ kstat_named_t arcstat_l2_compress_failures;
kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_duplicate_buffers;
+ kstat_named_t arcstat_duplicate_buffers_size;
+ kstat_named_t arcstat_duplicate_reads;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_memory_indirect_count;
kstat_named_t arcstat_no_grow;
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_compress_successes", KSTAT_DATA_UINT64 },
+ { "l2_compress_zeros", KSTAT_DATA_UINT64 },
+ { "l2_compress_failures", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "duplicate_buffers", KSTAT_DATA_UINT64 },
+ { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
+ { "duplicate_reads", KSTAT_DATA_UINT64 },
{ "memory_direct_count", KSTAT_DATA_UINT64 },
{ "memory_indirect_count", KSTAT_DATA_UINT64 },
{ "arc_no_grow", KSTAT_DATA_UINT64 },
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val));
+ atomic_add_64(&arc_stats.stat.value.ui64, (val))
#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
#define arc_no_grow ARCSTAT(arcstat_no_grow)
#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
-#define arc_meta_used ARCSTAT(arcstat_meta_used)
-#define arc_meta_limit ARCSTAT(arcstat_meta_limit)
-#define arc_meta_max ARCSTAT(arcstat_meta_max)
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+
+#define L2ARC_IS_VALID_COMPRESS(_c_) \
+ ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
- void *b_thawed;
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
/* updated atomically */
clock_t b_arc_access;
+ uint32_t b_mru_hits;
+ uint32_t b_mru_ghost_hits;
+ uint32_t b_mfu_hits;
+ uint32_t b_mfu_ghost_hits;
+ uint32_t b_l2_hits;
/* self protecting */
refcount_t b_refcnt;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type);
+static void arc_buf_watch(arc_buf_t *buf);
static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
*/
#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 2 /* num of writes */
+#define L2ARC_HEADROOM 2 /* num of writes */
+/*
+ * If we discover during ARC scan any buffers to be compressed, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define L2ARC_HEADROOM_BOOST 200
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
-/*
- * L2ARC Performance Tunables
- */
+/* L2ARC Performance Tunables */
unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
+unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+int l2arc_nocompress = B_FALSE; /* don't compress bufs */
int l2arc_feed_again = B_TRUE; /* turbo warmup */
-int l2arc_norw = B_TRUE; /* no reads during writes */
+int l2arc_norw = B_FALSE; /* no reads during writes */
/*
* L2ARC Internals
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_write; /* desired write size, bytes */
- uint64_t l2ad_boost; /* warmup write boost, bytes */
uint64_t l2ad_start; /* first addr on device */
uint64_t l2ad_end; /* last addr on device */
uint64_t l2ad_evict; /* last addr eviction reached */
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
- arc_buf_t *l2rcb_buf; /* read buffer */
- spa_t *l2rcb_spa; /* spa */
- blkptr_t l2rcb_bp; /* original blkptr */
- zbookmark_t l2rcb_zb; /* original bookmark */
- int l2rcb_flags; /* original flags */
+ arc_buf_t *l2rcb_buf; /* read buffer */
+ spa_t *l2rcb_spa; /* spa */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+ enum zio_compress l2rcb_compress; /* applied compress */
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+ /* compression applied to buffer data */
+ enum zio_compress b_compress;
+ /* real alloc'd buffer size depending on b_compress applied */
+ uint32_t b_asize;
+ uint32_t b_hits;
+ /* temporary buffer holder for in-flight compressed data */
+ void *b_tmp_cdata;
};
typedef struct l2arc_data_free {
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);
+static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
+static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
+ enum zio_compress c);
+static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
int i;
#if defined(_KERNEL) && defined(HAVE_SPL)
- /* Large allocations which do not require contiguous pages
- * should be using vmem_free() in the linux kernel */
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_free() in the linux kernel\
+ */
vmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
#else
bzero(buf, sizeof (arc_buf_t));
mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
arc_buf_t *buf = vbuf;
mutex_destroy(&buf->b_evict_lock);
- rw_destroy(&buf->b_data_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
retry:
buf_hash_table.ht_mask = hsize - 1;
#if defined(_KERNEL) && defined(HAVE_SPL)
- /* Large allocations which do not require contiguous pages
- * should be using vmem_alloc() in the linux kernel */
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel
+ */
buf_hash_table.ht_table =
vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
#else
return;
}
buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
- KM_PUSHPAGE);
+ KM_PUSHPAGE);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum);
mutex_exit(&buf->b_hdr->b_freeze_lock);
+ arc_buf_watch(buf);
+}
+
+#ifndef _KERNEL
+void
+arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
+{
+ panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
+}
+#endif
+
+/* ARGSUSED */
+static void
+arc_buf_unwatch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch) {
+ ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
+ PROT_READ | PROT_WRITE));
+ }
+#endif
+}
+
+/* ARGSUSED */
+static void
+arc_buf_watch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch)
+ ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
+#endif
}
void
buf->b_hdr->b_freeze_cksum = NULL;
}
- if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_thawed)
- kmem_free(buf->b_hdr->b_thawed, 1);
- buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
- }
-
mutex_exit(&buf->b_hdr->b_freeze_lock);
+
+ arc_buf_unwatch(buf);
}
void
buf->b_hdr->b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
mutex_exit(hash_lock);
+
}
static void
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(list, ab);
if (GHOST_STATE(ab->b_state)) {
- ASSERT3U(ab->b_datacnt, ==, 0);
+ ASSERT0(ab->b_datacnt);
ASSERT3P(ab->b_buf, ==, NULL);
delta = ab->b_size;
}
return (cnt);
}
+/*
+ * Returns detailed information about a specific arc buffer. When the
+ * state_index argument is set the function will calculate the arc header
+ * list position for its arc state. Since this requires a linear traversal
+ * callers are strongly encourage not to do this. However, it can be helpful
+ * for targeted analysis so the functionality is provided.
+ */
+void
+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
+{
+ arc_buf_hdr_t *hdr = ab->b_hdr;
+ arc_state_t *state = hdr->b_state;
+
+ memset(abi, 0, sizeof (arc_buf_info_t));
+ abi->abi_flags = hdr->b_flags;
+ abi->abi_datacnt = hdr->b_datacnt;
+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
+ abi->abi_state_contents = hdr->b_type;
+ abi->abi_state_index = -1;
+ abi->abi_size = hdr->b_size;
+ abi->abi_access = hdr->b_arc_access;
+ abi->abi_mru_hits = hdr->b_mru_hits;
+ abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits;
+ abi->abi_mfu_hits = hdr->b_mfu_hits;
+ abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
+ abi->abi_holds = refcount_count(&hdr->b_refcnt);
+
+ if (hdr->b_l2hdr) {
+ abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr;
+ abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize;
+ abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress;
+ abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits;
+ }
+
+ if (state && state_index && list_link_active(&hdr->b_arc_node)) {
+ list_t *list = &state->arcs_list[hdr->b_type];
+ arc_buf_hdr_t *h;
+
+ mutex_enter(&state->arcs_mtx);
+ for (h = list_head(list); h != NULL; h = list_next(list, h)) {
+ abi->abi_state_index++;
+ if (h == hdr)
+ break;
+ }
+ mutex_exit(&state->arcs_mtx);
+ }
+}
+
/*
* Move the supplied buffer to the indicated state. The mutex
* for the buffer must be held by the caller.
uint64_t from_delta, to_delta;
ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(new_state != old_state);
+ ASSERT3P(new_state, !=, old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
break;
}
- atomic_add_64(&arc_meta_used, space);
+ ARCSTAT_INCR(arcstat_meta_used, space);
atomic_add_64(&arc_size, space);
}
ASSERT(arc_meta_used >= space);
if (arc_meta_max < arc_meta_used)
arc_meta_max = arc_meta_used;
- atomic_add_64(&arc_meta_used, -space);
+ ARCSTAT_INCR(arcstat_meta_used, -space);
ASSERT(arc_size >= space);
atomic_add_64(&arc_size, -space);
}
-void *
-arc_data_buf_alloc(uint64_t size)
-{
- if (arc_evict_needed(ARC_BUFC_DATA))
- cv_signal(&arc_reclaim_thr_cv);
- atomic_add_64(&arc_size, size);
- return (zio_data_buf_alloc(size));
-}
-
-void
-arc_data_buf_free(void *buf, uint64_t size)
-{
- zio_data_buf_free(buf, size);
- ASSERT(arc_size >= size);
- atomic_add_64(&arc_size, -size);
-}
-
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
- hdr->b_spa = spa_guid(spa);
+ hdr->b_spa = spa_load_guid(spa);
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
+ hdr->b_mru_hits = 0;
+ hdr->b_mru_ghost_hits = 0;
+ hdr->b_mfu_hits = 0;
+ hdr->b_mfu_ghost_hits = 0;
+ hdr->b_l2_hits = 0;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
hdr->b_buf = buf;
arc_get_data_buf(buf);
bcopy(from->b_data, buf->b_data, size);
+
+ /*
+ * This buffer already exists in the arc so create a duplicate
+ * copy for the caller. If the buffer is associated with user data
+ * then track the size and number of duplicates. These stats will be
+ * updated as duplicate buffers are created and destroyed.
+ */
+ if (hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMP(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+ }
hdr->b_datacnt += 1;
return (buf);
}
* the buffer is placed on l2arc_free_on_write to be freed later.
*/
static void
-arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
- void *data, size_t size)
+arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
if (HDR_L2_WRITING(hdr)) {
l2arc_data_free_t *df;
- df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
- df->l2df_data = data;
- df->l2df_size = size;
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
+ df->l2df_data = buf->b_data;
+ df->l2df_size = hdr->b_size;
df->l2df_func = free_func;
mutex_enter(&l2arc_free_on_write_mtx);
list_insert_head(l2arc_free_on_write, df);
mutex_exit(&l2arc_free_on_write_mtx);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
- free_func(data, size);
+ free_func(buf->b_data, hdr->b_size);
}
}
arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
- arc_buf_data_free(buf->b_hdr, zio_buf_free,
- buf->b_data, size);
+ arc_buf_data_free(buf, zio_buf_free);
arc_space_return(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
- arc_buf_data_free(buf->b_hdr,
- zio_data_buf_free, buf->b_data, size);
+ arc_buf_data_free(buf, zio_data_buf_free);
ARCSTAT_INCR(arcstat_data_size, -size);
atomic_add_64(&arc_size, -size);
}
ASSERT3U(state->arcs_size, >=, size);
atomic_add_64(&state->arcs_size, -size);
buf->b_data = NULL;
+
+ /*
+ * If we're destroying a duplicate buffer make sure
+ * that the appropriate statistics are updated.
+ */
+ if (buf->b_hdr->b_datacnt > 1 &&
+ buf->b_hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+ }
ASSERT(buf->b_hdr->b_datacnt > 0);
buf->b_hdr->b_datacnt -= 1;
}
if (l2hdr != NULL) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
if (hdr->b_state == arc_l2c_only)
l2arc_hdr_stat_remove();
hdr->b_l2hdr = NULL;
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- if (hdr->b_thawed) {
- kmem_free(hdr->b_thawed, 1);
- hdr->b_thawed = NULL;
- }
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
}
}
-int
+boolean_t
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- int no_callback = (buf->b_efunc == NULL);
+ kmutex_t *hash_lock = NULL;
+ boolean_t no_callback = (buf->b_efunc == NULL);
if (hdr->b_state == arc_anon) {
ASSERT(hdr->b_datacnt == 1);
return (no_callback);
}
+ hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
hdr = buf->b_hdr;
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
return (buf->b_hdr->b_size);
}
+/*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ boolean_t evict_needed = B_FALSE;
+
+ if (zfs_disable_dup_eviction)
+ return (B_FALSE);
+
+ mutex_enter(&buf->b_evict_lock);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts(); let that function
+ * perform the eviction.
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&buf->b_evict_lock);
+ return (B_FALSE);
+ } else if (buf->b_data == NULL) {
+ /*
+ * We have already been added to the arc eviction list;
+ * recommend eviction.
+ */
+ ASSERT3P(hdr, ==, &arc_eviction_hdr);
+ mutex_exit(&buf->b_evict_lock);
+ return (B_TRUE);
+ }
+
+ if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+ evict_needed = B_TRUE;
+
+ mutex_exit(&buf->b_evict_lock);
+ return (evict_needed);
+}
+
/*
* Evict buffers from list until we've removed the specified number of
* bytes. Move the removed buffers to the appropriate evict state.
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
+ arc_buf_hdr_t marker = {{{ 0 }}};
+ int count = 0;
ASSERT(state == arc_mru || state == arc_mfu);
(spa && ab->b_spa != spa) ||
(ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
ddi_get_lbolt() - ab->b_arc_access <
- arc_min_prefetch_lifespan)) {
+ zfs_arc_min_prefetch_lifespan)) {
skipped++;
continue;
}
if (recycle && ab->b_size != bytes &&
ab_prev && ab_prev->b_size == bytes)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ *
+ * If we are looking for a buffer to recycle, we are in
+ * the hot code path, so don't sleep.
+ */
+ if (!recycle && count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&state->arcs_mtx);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(&state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
+
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&ab->b_refcnt));
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
ARCSTAT_INCR(arcstat_mutex_miss, missed);
/*
- * We have just evicted some date into the ghost state, make
- * sure we also adjust the ghost state size if necessary.
+ * Note: we have just evicted some data into the ghost state,
+ * potentially putting the ghost size over the desired size. Rather
+ * that evicting from the ghost list in this hot code path, leave
+ * this chore to the arc_reclaim_thread().
*/
- if (arc_no_grow &&
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
- int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
- arc_mru_ghost->arcs_size - arc_c;
-
- if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
- int64_t todelete =
- MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
- arc_evict_ghost(arc_mru_ghost, 0, todelete);
- } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
- int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
- arc_mru_ghost->arcs_size +
- arc_mfu_ghost->arcs_size - arc_c);
- arc_evict_ghost(arc_mfu_ghost, 0, todelete);
- }
- }
return (stolen);
}
* bytes. Destroy the buffers that are removed.
*/
static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
{
arc_buf_hdr_t *ab, *ab_prev;
arc_buf_hdr_t marker;
- list_t *list = &state->arcs_list[ARC_BUFC_DATA];
+ list_t *list = &state->arcs_list[type];
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
+ int count = 0;
ASSERT(GHOST_STATE(state));
- bzero(&marker, sizeof(marker));
+ bzero(&marker, sizeof (marker));
top:
mutex_enter(&state->arcs_mtx);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
+ if (ab->b_type > ARC_BUFC_NUMTYPES)
+ panic("invalid ab=%p", (void *)ab);
if (spa && ab->b_spa != spa)
continue;
/* caller may be trying to modify this buffer, skip it */
if (MUTEX_HELD(hash_lock))
continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ */
+ if (count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&state->arcs_mtx);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(&state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
mutex_enter(&state->arcs_mtx);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
- } else
+ } else {
bufs_skipped += 1;
+ }
}
mutex_exit(&state->arcs_mtx);
if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
delta = MIN(arc_mru_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mru_ghost, 0, delta);
+ arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA);
}
adjustment =
if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mfu_ghost, 0, delta);
+ arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA);
}
}
}
if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
- arc_do_user_prune(arc_meta_prune);
+ arc_do_user_prune(zfs_arc_meta_prune);
}
/*
uint64_t guid = 0;
if (spa)
- guid = spa_guid(spa);
+ guid = spa_load_guid(spa);
while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
break;
}
- arc_evict_ghost(arc_mru_ghost, guid, -1);
- arc_evict_ghost(arc_mfu_ghost, guid, -1);
+ arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA);
+ arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA);
mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts();
if (arc_c > arc_c_min) {
uint64_t to_free;
- to_free = bytes ? bytes : arc_c >> arc_shrink_shift;
+ to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
if (arc_c > arc_c_min + to_free)
atomic_add_64(&arc_c, -to_free);
else
arc_c = arc_c_min;
- atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
if (arc_c > arc_size)
arc_c = MAX(arc_size, arc_c_min);
if (arc_p > arc_c)
}
/* reset the growth delay for every reclaim */
- arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz);
+ arc_grow_time = ddi_get_lbolt() +
+ (zfs_arc_grow_retry * hz);
arc_kmem_reap_now(last_reclaim, 0);
arc_warm = B_TRUE;
(void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
&arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+
+
+ /* Allow the module options to be changed */
+ if (zfs_arc_max > 64 << 20 &&
+ zfs_arc_max < physmem * PAGESIZE &&
+ zfs_arc_max != arc_c_max)
+ arc_c_max = zfs_arc_max;
+
+ if (zfs_arc_min > 0 &&
+ zfs_arc_min < arc_c_max &&
+ zfs_arc_min != arc_c_min)
+ arc_c_min = zfs_arc_min;
+
+ if (zfs_arc_meta_limit > 0 &&
+ zfs_arc_meta_limit <= arc_c_max &&
+ zfs_arc_meta_limit != arc_meta_limit)
+ arc_meta_limit = zfs_arc_meta_limit;
+
+
+
}
arc_thread_exit = 0;
*/
if (pages > 0) {
arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
- pages = btop(arc_evictable_memory());
} else {
arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
- pages = -1;
}
/*
ARCSTAT_BUMP(arcstat_memory_indirect_count);
} else {
arc_no_grow = B_TRUE;
- arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz);
+ arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
ARCSTAT_BUMP(arcstat_memory_direct_count);
}
mutex_exit(&arc_reclaim_thr_lock);
- return (pages);
+ return (-1);
}
SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
arc_adapt(int bytes, arc_state_t *state)
{
int mult;
- uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
if (state == arc_l2c_only)
return;
if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
return (1);
-#ifdef _KERNEL
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then enforce that the size of available vmem for this area remains
- * above about 1/32nd free.
- */
- if (type == ARC_BUFC_DATA && zio_arena != NULL &&
- vmem_size(zio_arena, VMEM_FREE) <
- (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
- return (1);
-#endif
-
if (arc_no_grow)
return (1);
ASSERT(list_link_active(&buf->b_arc_node));
} else {
buf->b_flags &= ~ARC_PREFETCH;
+ atomic_inc_32(&buf->b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
}
buf->b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
}
+ atomic_inc_32(&buf->b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
} else if (buf->b_state == arc_mru_ghost) {
arc_state_t *new_state;
buf->b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, buf, hash_lock);
+ atomic_inc_32(&buf->b_mru_ghost_hits);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
} else if (buf->b_state == arc_mfu) {
/*
ASSERT(refcount_count(&buf->b_refcnt) == 0);
ASSERT(list_link_active(&buf->b_arc_node));
}
+ atomic_inc_32(&buf->b_mfu_hits);
ARCSTAT_BUMP(arcstat_mfu_hits);
buf->b_arc_access = ddi_get_lbolt();
} else if (buf->b_state == arc_mfu_ghost) {
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&buf->b_refcnt));
new_state = arc_mru;
}
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(new_state, buf, hash_lock);
+ atomic_inc_32(&buf->b_mfu_ghost_hits);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
} else if (buf->b_state == arc_l2c_only) {
/*
{
if (zio == NULL || zio->io_error == 0)
bcopy(buf->b_data, arg, buf->b_hdr->b_size);
- VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+ VERIFY(arc_buf_remove_ref(buf, arg));
}
/* a generic arc_done_func_t */
{
arc_buf_t **bufp = arg;
if (zio && zio->io_error) {
- VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+ VERIFY(arc_buf_remove_ref(buf, arg));
*bufp = NULL;
} else {
*bufp = buf;
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
- arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
- byteswap_uint64_array :
- dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
- func(buf->b_data, hdr->b_size);
+ dmu_object_byteswap_t bswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+ if (BP_GET_LEVEL(zio->io_bp) > 0)
+ byteswap_uint64_array(buf->b_data, hdr->b_size);
+ else
+ dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
}
arc_cksum_compute(buf, B_FALSE);
+ arc_buf_watch(buf);
if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
/*
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
if (acb->acb_done) {
- if (abuf == NULL)
+ if (abuf == NULL) {
+ ARCSTAT_BUMP(arcstat_duplicate_reads);
abuf = arc_buf_clone(buf);
+ }
acb->acb_buf = abuf;
abuf = NULL;
}
}
/*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
* cache. If the block is found in the cache, invoke the provided
* callback immediately and return. Note that the `zio' parameter
* in the callback will be NULL in this case, since no IO was
*
* arc_read_done() will invoke all the requested "done" functions
* for readers of this block.
- *
- * Normal callers should use arc_read and pass the arc buffer and offset
- * for the bp. But if you know you don't need locking, you can use
- * arc_read_bp.
*/
int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
- arc_done_func_t *done, void *private, int priority, int zio_flags,
- uint32_t *arc_flags, const zbookmark_t *zb)
-{
- int err;
-
- if (pbuf == NULL) {
- /*
- * XXX This happens from traverse callback funcs, for
- * the objset_phys_t block.
- */
- return (arc_read_nolock(pio, spa, bp, done, private, priority,
- zio_flags, arc_flags, zb));
- }
-
- ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
- ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
- rw_enter(&pbuf->b_data_lock, RW_READER);
-
- err = arc_read_nolock(pio, spa, bp, done, private, priority,
- zio_flags, arc_flags, zb);
- rw_exit(&pbuf->b_data_lock);
-
- return (err);
-}
-
-int
-arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- arc_done_func_t *done, void *private, int priority, int zio_flags,
- uint32_t *arc_flags, const zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+ void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
+ const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
zio_t *rzio;
- uint64_t guid = spa_guid(spa);
+ uint64_t guid = spa_load_guid(spa);
+ int rc = 0;
top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
hdr->b_acb = acb;
add_reference(hdr, hash_lock, private);
mutex_exit(hash_lock);
- return (0);
+ goto out;
}
mutex_exit(hash_lock);
- return (0);
+ goto out;
}
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
arc_access(hdr, hash_lock);
if (*arc_flags & ARC_L2CACHE)
hdr->b_flags |= ARC_L2CACHE;
+ if (*arc_flags & ARC_L2COMPRESS)
+ hdr->b_flags |= ARC_L2COMPRESS;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
vdev_t *vd = NULL;
- uint64_t addr = -1;
+ uint64_t addr = 0;
boolean_t devw = B_FALSE;
if (hdr == NULL) {
}
if (*arc_flags & ARC_L2CACHE)
hdr->b_flags |= ARC_L2CACHE;
+ if (*arc_flags & ARC_L2COMPRESS)
+ hdr->b_flags |= ARC_L2COMPRESS;
if (BP_GET_LEVEL(bp) > 0)
hdr->b_flags |= ARC_INDIRECT;
} else {
/* this block is in the ghost cache */
ASSERT(GHOST_STATE(hdr->b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+ ASSERT0(refcount_count(&hdr->b_refcnt));
ASSERT(hdr->b_buf == NULL);
/* if this is a prefetch, we don't have a reference */
add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_L2CACHE)
hdr->b_flags |= ARC_L2CACHE;
+ if (*arc_flags & ARC_L2COMPRESS)
+ hdr->b_flags |= ARC_L2COMPRESS;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
mutex_exit(hash_lock);
+ /*
+ * At this point, we have a level 1 cache miss. Try again in
+ * L2ARC if possible.
+ */
ASSERT3U(hdr->b_size, ==, size);
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
uint64_t, size, zbookmark_t *, zb);
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_hits);
+ atomic_inc_32(&hdr->b_l2hdr->b_hits);
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
KM_PUSHPAGE);
cb->l2rcb_bp = *bp;
cb->l2rcb_zb = *zb;
cb->l2rcb_flags = zio_flags;
+ cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
+
+ ASSERT(addr >= VDEV_LABEL_START_SIZE &&
+ addr + size < vd->vdev_psize -
+ VDEV_LABEL_END_SIZE);
/*
* l2arc read. The SCL_L2ARC lock will be
* released by l2arc_read_done().
+ * Issue a null zio if the underlying buffer
+ * was squashed to zero size by compression.
*/
- rzio = zio_read_phys(pio, vd, addr, size,
- buf->b_data, ZIO_CHECKSUM_OFF,
- l2arc_read_done, cb, priority, zio_flags |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY, B_FALSE);
+ if (hdr->b_l2hdr->b_compress ==
+ ZIO_COMPRESS_EMPTY) {
+ rzio = zio_null(pio, spa, vd,
+ l2arc_read_done, cb,
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ } else {
+ rzio = zio_read_phys(pio, vd, addr,
+ hdr->b_l2hdr->b_asize,
+ buf->b_data, ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority,
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ }
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
- ARCSTAT_INCR(arcstat_l2_read_bytes, size);
+ ARCSTAT_INCR(arcstat_l2_read_bytes,
+ hdr->b_l2hdr->b_asize);
if (*arc_flags & ARC_NOWAIT) {
zio_nowait(rzio);
- return (0);
+ goto out;
}
ASSERT(*arc_flags & ARC_WAIT);
if (zio_wait(rzio) == 0)
- return (0);
+ goto out;
/* l2arc read error; goto zio_read() */
} else {
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
- if (*arc_flags & ARC_WAIT)
- return (zio_wait(rzio));
+ if (*arc_flags & ARC_WAIT) {
+ rc = zio_wait(rzio);
+ goto out;
+ }
ASSERT(*arc_flags & ARC_NOWAIT);
zio_nowait(rzio);
}
- return (0);
+
+out:
+ spa_read_history_add(spa, zb, *arc_flags);
+ return (rc);
}
arc_prune_t *
{
arc_prune_t *p;
- p = kmem_alloc(sizeof(*p), KM_SLEEP);
+ p = kmem_alloc(sizeof (*p), KM_SLEEP);
p->p_pfunc = func;
p->p_private = private;
list_link_init(&p->p_node);
buf->b_private = private;
}
+/*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ uint64_t guid = spa_load_guid(spa);
+
+ hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+ &hash_lock);
+ if (hdr == NULL)
+ return;
+ if (HDR_BUF_AVAILABLE(hdr)) {
+ arc_buf_t *buf = hdr->b_buf;
+ add_reference(hdr, hash_lock, FTAG);
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ mutex_exit(hash_lock);
+
+ arc_release(buf, FTAG);
+ (void) arc_buf_remove_ref(buf, FTAG);
+ } else {
+ mutex_exit(hash_lock);
+ }
+
+}
+
/*
* This is used by the DMU to let the ARC know that a buffer is
* being evicted, so the ARC should clean up. If this arc buf
}
/*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
* a new hdr for the buffer.
*/
if (l2hdr) {
mutex_enter(&l2arc_buflist_mtx);
hdr->b_l2hdr = NULL;
- buf_size = hdr->b_size;
}
+ buf_size = hdr->b_size;
/*
* Do we have more than one buf?
ASSERT3U(*size, >=, hdr->b_size);
atomic_add_64(size, -hdr->b_size);
}
+
+ /*
+ * We're releasing a duplicate user data buffer, update
+ * our statistics accordingly.
+ */
+ if (hdr->b_type == ARC_BUFC_DATA) {
+ ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+ ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+ -hdr->b_size);
+ }
hdr->b_datacnt -= 1;
arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
mutex_exit(hash_lock);
nhdr->b_buf = buf;
nhdr->b_state = arc_anon;
nhdr->b_arc_access = 0;
+ nhdr->b_mru_hits = 0;
+ nhdr->b_mru_ghost_hits = 0;
+ nhdr->b_mfu_hits = 0;
+ nhdr->b_mfu_ghost_hits = 0;
+ nhdr->b_l2_hits = 0;
nhdr->b_flags = flags & ARC_L2_WRITING;
nhdr->b_l2hdr = NULL;
nhdr->b_datacnt = 1;
if (hdr->b_state != arc_anon)
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
+ hdr->b_mru_hits = 0;
+ hdr->b_mru_ghost_hits = 0;
+ hdr->b_mfu_hits = 0;
+ hdr->b_mfu_ghost_hits = 0;
+ hdr->b_l2_hits = 0;
if (hash_lock)
mutex_exit(hash_lock);
buf->b_private = NULL;
if (l2hdr) {
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
ARCSTAT_INCR(arcstat_l2_size, -buf_size);
mutex_exit(&l2arc_buflist_mtx);
}
}
-/*
- * Release this buffer. If it does not match the provided BP, fill it
- * with that block's contents.
- */
-/* ARGSUSED */
-int
-arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
- zbookmark_t *zb)
-{
- arc_release(buf, tag);
- return (0);
-}
-
int
arc_released(arc_buf_t *buf)
{
hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
static void
arc_write_done(zio_t *zio)
{
arc_hdr_destroy(exists);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
+ } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+ /* nopwrite */
+ ASSERT(zio->io_prop.zp_nopwrite);
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad nopwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
} else {
/* Dedup */
ASSERT(hdr->b_datacnt == 1);
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
- blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
- arc_done_func_t *ready, arc_done_func_t *done, void *private,
- int priority, int zio_flags, const zbookmark_t *zb)
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+ arc_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
ASSERT(hdr->b_acb == NULL);
if (l2arc)
hdr->b_flags |= ARC_L2CACHE;
- callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ if (l2arc_compress)
+ hdr->b_flags |= ARC_L2COMPRESS;
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
callback->awcb_ready = ready;
+ callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
- arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+ arc_write_ready, arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
return (zio);
}
static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t available_memory;
-
- /* Easily reclaimable memory (free + inactive + arc-evictable) */
- available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory();
-#if defined(__i386)
- available_memory =
- MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
-#endif
+ if (zfs_arc_memory_throttle_disable)
+ return (0);
- if (available_memory <= zfs_write_limit_max) {
+ if (freemem <= physmem * arc_lotsfree_percent / 100) {
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
- return (EAGAIN);
- }
-
- if (inflight_data > available_memory / 4) {
- ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- DMU_TX_STAT_BUMP(dmu_tx_memory_inflight);
- return (ERESTART);
+ return (SET_ERROR(EAGAIN));
}
#endif
return (0);
int error;
uint64_t anon_size;
-#ifdef ZFS_DEBUG
- /*
- * Once in a while, fail for no reason. Everything should cope.
- */
- if (spa_get_random(10000) == 0) {
- dprintf("forcing random failure\n");
- return (ERESTART);
- }
-#endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
if (reserve > arc_c) {
DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
- return (ENOMEM);
+ return (SET_ERROR(ENOMEM));
}
/*
/*
* Writes will, almost always, require additional memory allocations
- * in order to compress/encrypt/etc the data. We therefor need to
+ * in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- if ((error = arc_memory_throttle(reserve, anon_size, txg)))
+ error = arc_memory_throttle(reserve, txg);
+ if (error != 0)
return (error);
/*
arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
reserve>>10, arc_c>>10);
DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
- return (ERESTART);
+ return (SET_ERROR(ERESTART));
}
atomic_add_64(&arc_tempreserve, reserve);
return (0);
arc_stats_t *as = ksp->ks_data;
if (rw == KSTAT_WRITE) {
- return (EACCES);
+ return (SET_ERROR(EACCES));
} else {
arc_kstat_update_state(arc_anon,
&as->arcstat_anon_size,
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
- arc_min_prefetch_lifespan = 1 * hz;
+ zfs_arc_min_prefetch_lifespan = 1 * hz;
/* Start out with 1/8 of all memory */
arc_c = physmem * PAGESIZE / 8;
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
arc_c_min = MAX(arc_c / 4, 64<<20);
- /* set max to 1/2 of all memory, or all but 4GB, whichever is more */
- if (arc_c * 8 >= ((uint64_t)4<<30))
- arc_c_max = (arc_c * 8) - ((uint64_t)4<<30);
- else
- arc_c_max = arc_c_min;
- arc_c_max = MAX(arc_c * 4, arc_c_max);
+ /* set max to 1/2 of all memory */
+ arc_c_max = arc_c * 4;
/*
* Allow the tunables to override our calculations if they are
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
arc_c_min = arc_meta_limit / 2;
- if (zfs_arc_grow_retry > 0)
- arc_grow_retry = zfs_arc_grow_retry;
-
- if (zfs_arc_shrink_shift > 0)
- arc_shrink_shift = zfs_arc_shrink_shift;
-
- if (zfs_arc_p_min_shift > 0)
- arc_p_min_shift = zfs_arc_p_min_shift;
-
- if (zfs_arc_meta_prune > 0)
- arc_meta_prune = zfs_arc_meta_prune;
-
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ arc_anon->arcs_state = ARC_STATE_ANON;
+ arc_mru->arcs_state = ARC_STATE_MRU;
+ arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
+ arc_mfu->arcs_state = ARC_STATE_MFU;
+ arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
+ arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
+
buf_init();
arc_thread_exit = 0;
arc_dead = FALSE;
arc_warm = B_FALSE;
- if (zfs_write_limit_max == 0)
- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
- else
- zfs_write_limit_shift = 0;
- mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * Calculate maximum amount of dirty data per pool.
+ *
+ * If it has been set by a module parameter, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 25% of physical memory).
+ */
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = physmem * PAGESIZE *
+ zfs_dirty_data_max_max_percent / 100;
+
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = physmem * PAGESIZE *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
}
void
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
mutex_destroy(&arc_l2c_only->arcs_mtx);
- mutex_destroy(&zfs_write_limit_lock);
-
buf_fini();
ASSERT(arc_loaned_bytes == 0);
* 2. The L2ARC attempts to cache data from the ARC before it is evicted.
* It does this by periodically scanning buffers from the eviction-end of
* the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
- * not already there. It scans until a headroom of buffers is satisfied,
- * which itself is a buffer for ARC eviction. The thread that does this is
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
* l2arc_feed_thread(), illustrated below; example sizes are included to
* provide a better sense of ratio than this diagram:
*
* l2arc_write_max max write bytes per interval
* l2arc_write_boost extra write bytes during device warmup
* l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_nocompress skip compressing buffers
* l2arc_headroom number of max device writes to precache
+ * l2arc_headroom_boost when we find compressed buffers during ARC
+ * scanning, we multiply headroom by this
+ * percentage factor for the next scan cycle,
+ * since more compressed buffers are likely to
+ * be present
* l2arc_feed_secs seconds between L2ARC writing
*
* Tunables may be removed or added as future performance improvements are
}
static uint64_t
-l2arc_write_size(l2arc_dev_t *dev)
+l2arc_write_size(void)
{
uint64_t size;
- size = dev->l2ad_write;
+ /*
+ * Make sure our globals have meaningful values in case the user
+ * altered them.
+ */
+ size = l2arc_write_max;
+ if (size == 0) {
+ cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+ "be greater than zero, resetting it to the default (%d)",
+ L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = L2ARC_WRITE_SIZE;
+ }
if (arc_warm == B_FALSE)
- size += dev->l2ad_boost;
+ size += l2arc_write_boost;
return (size);
static void
l2arc_hdr_stat_add(void)
{
- ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
+ ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
}
static void
l2arc_hdr_stat_remove(void)
{
- ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
+ ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
}
*/
for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
ab_prev = list_prev(buflist, ab);
+ abl2 = ab->b_l2hdr;
+
+ /*
+ * Release the temporary compressed buffer as soon as possible.
+ */
+ if (abl2->b_compress != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(ab);
hash_lock = HDR_LOCK(ab);
if (!mutex_tryenter(hash_lock)) {
* Error - drop L2ARC entry.
*/
list_remove(buflist, ab);
- abl2 = ab->b_l2hdr;
+ ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
ab->b_l2hdr = NULL;
kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
hdr = buf->b_hdr;
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ /*
+ * If the buffer was compressed, decompress it first.
+ */
+ if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
+ l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
+ ASSERT(zio->io_data != NULL);
+
/*
* Check this survived the L2ARC journey.
*/
if (zio->io_error != 0) {
ARCSTAT_BUMP(arcstat_l2_io_error);
} else {
- zio->io_error = EIO;
+ zio->io_error = SET_ERROR(EIO);
}
if (!equal)
ARCSTAT_BUMP(arcstat_l2_cksum_bad);
*/
if (ab->b_l2hdr != NULL) {
abl2 = ab->b_l2hdr;
+ ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
ab->b_l2hdr = NULL;
kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
}
list_remove(buflist, ab);
*
* An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
* for reading until they have completed writing.
+ * The headroom_boost is an in-out parameter used to maintain headroom boost
+ * state between calls to this function.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment).
*/
static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
+ boolean_t *headroom_boost)
{
arc_buf_hdr_t *ab, *ab_prev, *head;
- l2arc_buf_hdr_t *hdrl2;
list_t *list;
- uint64_t passed_sz, write_sz, buf_sz, headroom;
+ uint64_t write_asize, write_psize, write_sz, headroom,
+ buf_compress_minsz;
void *buf_data;
- kmutex_t *hash_lock, *list_lock = NULL;
- boolean_t have_lock, full;
+ kmutex_t *list_lock = NULL;
+ boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
- uint64_t guid = spa_guid(spa);
+ uint64_t guid = spa_load_guid(spa);
int try;
+ const boolean_t do_headroom_boost = *headroom_boost;
ASSERT(dev->l2ad_vdev != NULL);
+ /* Lower the flag now, we might want to raise it again later. */
+ *headroom_boost = B_FALSE;
+
pio = NULL;
- write_sz = 0;
+ write_sz = write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
+ /*
+ * We will want to try to compress buffers that are at least 2x the
+ * device sector size.
+ */
+ buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
+
/*
* Copy buffers for L2ARC writing.
*/
mutex_enter(&l2arc_buflist_mtx);
for (try = 0; try <= 3; try++) {
+ uint64_t passed_sz = 0;
+
list = l2arc_list_locked(try, &list_lock);
- passed_sz = 0;
/*
* L2ARC fast warmup.
* Until the ARC is warm and starts to evict, read from the
* head of the ARC lists rather than the tail.
*/
- headroom = target_sz * l2arc_headroom;
if (arc_warm == B_FALSE)
ab = list_head(list);
else
ab = list_tail(list);
+ headroom = target_sz * l2arc_headroom;
+ if (do_headroom_boost)
+ headroom = (headroom * l2arc_headroom_boost) / 100;
+
for (; ab; ab = ab_prev) {
+ l2arc_buf_hdr_t *l2hdr;
+ kmutex_t *hash_lock;
+ uint64_t buf_sz;
+
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
hash_lock = HDR_LOCK(ab);
- have_lock = MUTEX_HELD(hash_lock);
- if (!have_lock && !mutex_tryenter(hash_lock)) {
+ if (!mutex_tryenter(hash_lock)) {
/*
* Skip this buffer rather than waiting.
*/
list_insert_head(dev->l2ad_buflist, head);
cb = kmem_alloc(sizeof (l2arc_write_callback_t),
- KM_PUSHPAGE);
+ KM_PUSHPAGE);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
/*
* Create and add a new L2ARC header.
*/
- hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
- KM_PUSHPAGE);
- hdrl2->b_dev = dev;
- hdrl2->b_daddr = dev->l2ad_hand;
+ l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t),
+ KM_PUSHPAGE);
+ l2hdr->b_dev = dev;
+ arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
ab->b_flags |= ARC_L2_WRITING;
- ab->b_l2hdr = hdrl2;
- list_insert_head(dev->l2ad_buflist, ab);
- buf_data = ab->b_buf->b_data;
+
+ /*
+ * Temporarily stash the data buffer in b_tmp_cdata.
+ * The subsequent write step will pick it up from
+ * there. This is because can't access ab->b_buf
+ * without holding the hash_lock, which we in turn
+ * can't access without holding the ARC list locks
+ * (which we want to avoid during compression/writing)
+ */
+ l2hdr->b_compress = ZIO_COMPRESS_OFF;
+ l2hdr->b_asize = ab->b_size;
+ l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+ l2hdr->b_hits = 0;
+
buf_sz = ab->b_size;
+ ab->b_l2hdr = l2hdr;
+
+ list_insert_head(dev->l2ad_buflist, ab);
/*
* Compute and store the buffer cksum before
mutex_exit(hash_lock);
+ write_sz += buf_sz;
+ }
+
+ mutex_exit(list_lock);
+
+ if (full == B_TRUE)
+ break;
+ }
+
+ /* No buffers selected for writing? */
+ if (pio == NULL) {
+ ASSERT0(write_sz);
+ mutex_exit(&l2arc_buflist_mtx);
+ kmem_cache_free(hdr_cache, head);
+ return (0);
+ }
+
+ /*
+ * Now start writing the buffers. We're starting at the write head
+ * and work backwards, retracing the course of the buffer selector
+ * loop above.
+ */
+ for (ab = list_prev(dev->l2ad_buflist, head); ab;
+ ab = list_prev(dev->l2ad_buflist, ab)) {
+ l2arc_buf_hdr_t *l2hdr;
+ uint64_t buf_sz;
+
+ /*
+ * We shouldn't need to lock the buffer here, since we flagged
+ * it as ARC_L2_WRITING in the previous step, but we must take
+ * care to only access its L2 cache parameters. In particular,
+ * ab->b_buf may be invalid by now due to ARC eviction.
+ */
+ l2hdr = ab->b_l2hdr;
+ l2hdr->b_daddr = dev->l2ad_hand;
+
+ if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
+ l2hdr->b_asize >= buf_compress_minsz) {
+ if (l2arc_compress_buf(l2hdr)) {
+ /*
+ * If compression succeeded, enable headroom
+ * boost on the next scan cycle.
+ */
+ *headroom_boost = B_TRUE;
+ }
+ }
+
+ /*
+ * Pick up the buffer data we had previously stashed away
+ * (and now potentially also compressed).
+ */
+ buf_data = l2hdr->b_tmp_cdata;
+ buf_sz = l2hdr->b_asize;
+
+ /* Compression may have squashed the buffer to zero length. */
+ if (buf_sz != 0) {
+ uint64_t buf_p_sz;
+
wzio = zio_write_phys(pio, dev->l2ad_vdev,
dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
zio_t *, wzio);
(void) zio_nowait(wzio);
+ write_asize += buf_sz;
/*
* Keep the clock hand suitably device-aligned.
*/
- buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-
- write_sz += buf_sz;
- dev->l2ad_hand += buf_sz;
+ buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+ write_psize += buf_p_sz;
+ dev->l2ad_hand += buf_p_sz;
}
-
- mutex_exit(list_lock);
-
- if (full == B_TRUE)
- break;
}
- mutex_exit(&l2arc_buflist_mtx);
- if (pio == NULL) {
- ASSERT3U(write_sz, ==, 0);
- kmem_cache_free(hdr_cache, head);
- return (0);
- }
+ mutex_exit(&l2arc_buflist_mtx);
- ASSERT3U(write_sz, <=, target_sz);
+ ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
- ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
+ ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
- vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_asize, write_asize);
+ vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
(void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
- return (write_sz);
+ return (write_asize);
+}
+
+/*
+ * Compresses an L2ARC buffer.
+ * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
+ * size in l2hdr->b_asize. This routine tries to compress the data and
+ * depending on the compression result there are three possible outcomes:
+ * *) The buffer was incompressible. The original l2hdr contents were left
+ * untouched and are ready for writing to an L2 device.
+ * *) The buffer was all-zeros, so there is no need to write it to an L2
+ * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
+ * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
+ * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
+ * data buffer which holds the compressed data to be written, and b_asize
+ * tells us how much data there is. b_compress is set to the appropriate
+ * compression algorithm. Once writing is done, invoke
+ * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
+ *
+ * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
+ * buffer was incompressible).
+ */
+static boolean_t
+l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+{
+ void *cdata;
+ size_t csize, len;
+
+ ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
+ ASSERT(l2hdr->b_tmp_cdata != NULL);
+
+ len = l2hdr->b_asize;
+ cdata = zio_data_buf_alloc(len);
+ csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
+ cdata, l2hdr->b_asize);
+
+ if (csize == 0) {
+ /* zero block, indicate that there's nothing to write */
+ zio_data_buf_free(cdata, len);
+ l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
+ l2hdr->b_asize = 0;
+ l2hdr->b_tmp_cdata = NULL;
+ ARCSTAT_BUMP(arcstat_l2_compress_zeros);
+ return (B_TRUE);
+ } else if (csize > 0 && csize < len) {
+ /*
+ * Compression succeeded, we'll keep the cdata around for
+ * writing and release it afterwards.
+ */
+ l2hdr->b_compress = ZIO_COMPRESS_LZ4;
+ l2hdr->b_asize = csize;
+ l2hdr->b_tmp_cdata = cdata;
+ ARCSTAT_BUMP(arcstat_l2_compress_successes);
+ return (B_TRUE);
+ } else {
+ /*
+ * Compression failed, release the compressed buffer.
+ * l2hdr will be left unmodified.
+ */
+ zio_data_buf_free(cdata, len);
+ ARCSTAT_BUMP(arcstat_l2_compress_failures);
+ return (B_FALSE);
+ }
+}
+
+/*
+ * Decompresses a zio read back from an l2arc device. On success, the
+ * underlying zio's io_data buffer is overwritten by the uncompressed
+ * version. On decompression error (corrupt compressed stream), the
+ * zio->io_error value is set to signal an I/O error.
+ *
+ * Please note that the compressed data stream is not checksummed, so
+ * if the underlying device is experiencing data corruption, we may feed
+ * corrupt data to the decompressor, so the decompressor needs to be
+ * able to handle this situation (LZ4 does).
+ */
+static void
+l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
+{
+ uint64_t csize;
+ void *cdata;
+
+ ASSERT(L2ARC_IS_VALID_COMPRESS(c));
+
+ if (zio->io_error != 0) {
+ /*
+ * An io error has occured, just restore the original io
+ * size in preparation for a main pool read.
+ */
+ zio->io_orig_size = zio->io_size = hdr->b_size;
+ return;
+ }
+
+ if (c == ZIO_COMPRESS_EMPTY) {
+ /*
+ * An empty buffer results in a null zio, which means we
+ * need to fill its io_data after we're done restoring the
+ * buffer's contents.
+ */
+ ASSERT(hdr->b_buf != NULL);
+ bzero(hdr->b_buf->b_data, hdr->b_size);
+ zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
+ } else {
+ ASSERT(zio->io_data != NULL);
+ /*
+ * We copy the compressed data from the start of the arc buffer
+ * (the zio_read will have pulled in only what we need, the
+ * rest is garbage which we will overwrite at decompression)
+ * and then decompress back to the ARC data buffer. This way we
+ * can minimize copying by simply decompressing back over the
+ * original compressed data (rather than decompressing to an
+ * aux buffer and then copying back the uncompressed buffer,
+ * which is likely to be much larger).
+ */
+ csize = zio->io_size;
+ cdata = zio_data_buf_alloc(csize);
+ bcopy(zio->io_data, cdata, csize);
+ if (zio_decompress_data(c, cdata, zio->io_data, csize,
+ hdr->b_size) != 0)
+ zio->io_error = SET_ERROR(EIO);
+ zio_data_buf_free(cdata, csize);
+ }
+
+ /* Restore the expected uncompressed IO size. */
+ zio->io_orig_size = zio->io_size = hdr->b_size;
+}
+
+/*
+ * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
+ * This buffer serves as a temporary holder of compressed data while
+ * the buffer entry is being written to an l2arc device. Once that is
+ * done, we can dispose of it.
+ */
+static void
+l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
+{
+ l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+
+ if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
+ /*
+ * If the data was compressed, then we've allocated a
+ * temporary buffer for it, so now we need to release it.
+ */
+ ASSERT(l2hdr->b_tmp_cdata != NULL);
+ zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
+ }
+ l2hdr->b_tmp_cdata = NULL;
}
/*
spa_t *spa;
uint64_t size, wrote;
clock_t begin, next = ddi_get_lbolt();
+ boolean_t headroom_boost = B_FALSE;
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
ARCSTAT_BUMP(arcstat_l2_feeds);
- size = l2arc_write_size(dev);
+ size = l2arc_write_size();
/*
* Evict L2ARC buffers that will be overwritten.
/*
* Write ARC buffers.
*/
- wrote = l2arc_write_buffers(spa, dev, size);
+ wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
/*
* Calculate interval between writes.
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
- adddev->l2ad_write = l2arc_write_max;
- adddev->l2ad_boost = l2arc_write_boost;
adddev->l2ad_start = VDEV_LABEL_START_SIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
list_link_init(&adddev->l2ad_node);
- ASSERT3U(adddev->l2ad_write, >, 0);
/*
* This is a list of all ARC buffers that are still valid on the
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);
-module_param(zfs_arc_min, ulong, 0444);
+module_param(zfs_arc_min, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
-module_param(zfs_arc_max, ulong, 0444);
+module_param(zfs_arc_max, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
-module_param(zfs_arc_meta_limit, ulong, 0444);
+module_param(zfs_arc_meta_limit, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
-module_param(zfs_arc_meta_prune, int, 0444);
+module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
-module_param(zfs_arc_grow_retry, int, 0444);
+module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
-module_param(zfs_arc_shrink_shift, int, 0444);
+module_param(zfs_arc_shrink_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
-module_param(zfs_arc_p_min_shift, int, 0444);
+module_param(zfs_arc_p_min_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
-module_param(l2arc_write_max, ulong, 0444);
+module_param(zfs_disable_dup_eviction, int, 0644);
+MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
+
+module_param(zfs_arc_memory_throttle_disable, int, 0644);
+MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
+
+module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+
+module_param(l2arc_write_max, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
-module_param(l2arc_write_boost, ulong, 0444);
+module_param(l2arc_write_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
-module_param(l2arc_headroom, ulong, 0444);
+module_param(l2arc_headroom, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
-module_param(l2arc_feed_secs, ulong, 0444);
+module_param(l2arc_headroom_boost, ulong, 0644);
+MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
+
+module_param(l2arc_feed_secs, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
-module_param(l2arc_feed_min_ms, ulong, 0444);
+module_param(l2arc_feed_min_ms, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
-module_param(l2arc_noprefetch, int, 0444);
+module_param(l2arc_noprefetch, int, 0644);
MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
-module_param(l2arc_feed_again, int, 0444);
+module_param(l2arc_nocompress, int, 0644);
+MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
+
+module_param(l2arc_feed_again, int, 0644);
MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
-module_param(l2arc_norw, int, 0444);
+module_param(l2arc_norw, int, 0644);
MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
#endif