* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
#include <sys/spa.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
+#include <zfs_fletcher.h>
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
+/* shift of arc_c for calculating both min and max arc_p */
+static int arc_p_min_shift = 4;
+
+/* log2(fraction of arc to reclaim) */
+static int arc_shrink_shift = 5;
+
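/*
 * Illustrative arithmetic (editor's sketch, not part of the change):
 * with the defaults above and arc_c = 1 GiB, arc_p is kept within
 * [arc_c >> 4, arc_c - (arc_c >> 4)] = [64 MiB, 960 MiB], and each
 * arc_shrink() pass targets roughly arc_c >> 5 = 32 MiB.
 */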
/*
* minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
/*
* These tunables are for performance analysis.
*/
-uint64_t zfs_arc_max;
-uint64_t zfs_arc_min;
-uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
+int zfs_arc_grow_retry = 0;
+int zfs_arc_shrink_shift = 0;
+int zfs_arc_p_min_shift = 0;
/*
* Note that buffers can be in one of 6 states:
kstat_named_t arcstat_recycle_miss;
kstat_named_t arcstat_mutex_miss;
kstat_named_t arcstat_evict_skip;
+ kstat_named_t arcstat_evict_l2_cached;
+ kstat_named_t arcstat_evict_l2_eligible;
+ kstat_named_t arcstat_evict_l2_ineligible;
kstat_named_t arcstat_hash_elements;
kstat_named_t arcstat_hash_elements_max;
kstat_named_t arcstat_hash_collisions;
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
kstat_named_t arcstat_hdr_size;
+ kstat_named_t arcstat_data_size;
+ kstat_named_t arcstat_other_size;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds;
kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_read_bytes;
+ kstat_named_t arcstat_l2_write_bytes;
kstat_named_t arcstat_l2_writes_sent;
kstat_named_t arcstat_l2_writes_done;
kstat_named_t arcstat_l2_writes_error;
{ "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_l2_cached", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
{ "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_collisions", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
{ "size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 },
+ { "data_size", KSTAT_DATA_UINT64 },
+ { "other_size", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_read_bytes", KSTAT_DATA_UINT64 },
+ { "l2_write_bytes", KSTAT_DATA_UINT64 },
{ "l2_writes_sent", KSTAT_DATA_UINT64 },
{ "l2_writes_done", KSTAT_DATA_UINT64 },
{ "l2_writes_error", KSTAT_DATA_UINT64 },
#define ARCSTAT_INCR(stat, val) \
atomic_add_64(&arc_stats.stat.value.ui64, (val));
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
#define ARCSTAT_MAX(stat, val) { \
}
kstat_t *arc_ksp;
-static arc_state_t *arc_anon;
+static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
+static uint64_t arc_loaned_bytes;
static uint64_t arc_meta_used;
static uint64_t arc_meta_limit;
static uint64_t arc_meta_max = 0;
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
+ void *b_thawed;
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
/* immutable */
arc_buf_contents_t b_type;
uint64_t b_size;
- spa_t *b_spa;
+ uint64_t b_spa;
/* protected by arc state mutex */
arc_state_t *b_state;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
+
+static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
#define GHOST_STATE(state) \
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
-#define ARC_STORED (1 << 19) /* has been store()d to */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
+#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
* Hash table routines
*/
-#define HT_LOCK_PAD 64
+#define HT_LOCK_ALIGN 64
+#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
struct ht_lock {
kmutex_t ht_lock;
#ifdef _KERNEL
- unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+ unsigned char pad[HT_LOCK_PAD];
#endif
};
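/*
 * Worked example (editor's sketch; the 24-byte kmutex_t is an
 * assumption): P2NPHASE(24, 64) = 40, so pad[] rounds each ht_lock up
 * to a multiple of 64 bytes, here one cache line.  The old fixed pad
 * of (64 - sizeof (kmutex_t)) underflowed whenever lock debugging grew
 * kmutex_t past 64 bytes; P2NPHASE cannot.
 */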
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define HDR_LOCK(buf) \
- (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define HDR_LOCK(hdr) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
*/
#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 4 /* num of writes */
-#define L2ARC_FEED_SECS 1 /* caching interval */
+#define L2ARC_HEADROOM 2 /* num of writes */
+#define L2ARC_FEED_SECS 1 /* caching interval secs */
+#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
+boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/*
* L2ARC Internals
uint64_t l2ad_end; /* last addr on device */
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
list_t *l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
} l2arc_dev_t;
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
- daddr_t b_daddr; /* disk address, offset byte */
+ uint64_t b_daddr; /* disk address, offset byte */
};
typedef struct l2arc_data_free {
static void l2arc_hdr_stat_remove(void);
static uint64_t
-buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
+buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
- uintptr_t spav = (uintptr_t)spa;
uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL;
int i;
for (i = 0; i < sizeof (dva_t); i++)
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
- crc ^= (spav>>8) ^ birth;
+ crc ^= (spa>>8) ^ birth;
return (crc);
}
((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+ hdr->b_dva.dva_word[0] = 0;
+ hdr->b_dva.dva_word[1] = 0;
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+}
+
static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
{
int i;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/*
+	 * Large allocations which do not require contiguous pages
+	 * should be using vmem_free() in the Linux kernel.
+	 */
+ vmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
kmem_cache_destroy(hdr_cache);
refcount_create(&buf->b_refcnt);
cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_link_init(&buf->b_arc_node);
+ list_link_init(&buf->b_l2node);
+ arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
- ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
return (0);
}
arc_buf_t *buf = vbuf;
bzero(buf, sizeof (arc_buf_t));
- rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
+ arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+
return (0);
}
{
arc_buf_hdr_t *buf = vbuf;
+ ASSERT(BUF_EMPTY(buf));
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
mutex_destroy(&buf->b_freeze_lock);
-
- ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
+ arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
}
/* ARGSUSED */
{
arc_buf_t *buf = vbuf;
- rw_destroy(&buf->b_lock);
+ mutex_destroy(&buf->b_evict_lock);
+ rw_destroy(&buf->b_data_lock);
+ arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
/*
hsize <<= 1;
retry:
buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/*
+	 * Large allocations which do not require contiguous pages
+	 * should be using vmem_alloc() in the Linux kernel.
+	 */
+ buf_hash_table.ht_table =
+ vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
buf_hash_table.ht_table =
kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
if (buf_hash_table.ht_table == NULL) {
ASSERT(hsize > (1ULL << 8));
hsize >>= 1;
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
buf->b_hdr->b_freeze_cksum = NULL;
}
+
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_thawed)
+ kmem_free(buf->b_hdr->b_thawed, 1);
+ buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ }
+
mutex_exit(&buf->b_hdr->b_freeze_lock);
}
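/*
 * Editorial note: the 1-byte b_thawed allocation above is presumably
 * there so kmem debugging facilities record the stack trace of the
 * most recent thaw; freeing and reallocating it on each call keeps
 * that audit record current.
 */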
void
arc_buf_freeze(arc_buf_t *buf)
{
+ kmutex_t *hash_lock;
+
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ hash_lock = HDR_LOCK(buf->b_hdr);
+ mutex_enter(hash_lock);
+
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
buf->b_hdr->b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
+ mutex_exit(hash_lock);
}
static void
ASSERT(new_state != old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+ ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
from_delta = to_delta = ab->b_datacnt * ab->b_size;
/*
* If prefetching out of the ghost cache,
- * we will have a non-null datacnt.
+ * we will have a non-zero datacnt.
*/
if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
/* ghost elements have a ghost size */
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon) {
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
buf_hash_remove(ab);
- }
/* adjust state sizes */
if (to_delta)
}
void
-arc_space_consume(uint64_t space)
+arc_space_consume(uint64_t space, arc_space_type_t type)
{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ default:
+ break;
+ case ARC_SPACE_DATA:
+ ARCSTAT_INCR(arcstat_data_size, space);
+ break;
+ case ARC_SPACE_OTHER:
+ ARCSTAT_INCR(arcstat_other_size, space);
+ break;
+ case ARC_SPACE_HDRS:
+ ARCSTAT_INCR(arcstat_hdr_size, space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ ARCSTAT_INCR(arcstat_l2_hdr_size, space);
+ break;
+ }
+
atomic_add_64(&arc_meta_used, space);
atomic_add_64(&arc_size, space);
}
void
-arc_space_return(uint64_t space)
+arc_space_return(uint64_t space, arc_space_type_t type)
{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ default:
+ break;
+ case ARC_SPACE_DATA:
+ ARCSTAT_INCR(arcstat_data_size, -space);
+ break;
+ case ARC_SPACE_OTHER:
+ ARCSTAT_INCR(arcstat_other_size, -space);
+ break;
+ case ARC_SPACE_HDRS:
+ ARCSTAT_INCR(arcstat_hdr_size, -space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
+ break;
+ }
+
ASSERT(arc_meta_used >= space);
if (arc_meta_max < arc_meta_used)
arc_meta_max = arc_meta_used;
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
- hdr->b_spa = spa;
+ hdr->b_spa = spa_guid(spa);
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
return (buf);
}
+static char *arc_onloan_tag = "onloan";
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, int size)
+{
+ arc_buf_t *buf;
+
+ buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+
+ atomic_add_64(&arc_loaned_bytes, size);
+ return (buf);
+}
+
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(buf->b_data != NULL);
+ (void) refcount_add(&hdr->b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+
+ atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
+}
+
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(buf->b_data != NULL);
+ hdr = buf->b_hdr;
+ (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_refcnt, tag);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+
+ atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+}
+
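/*
 * Sketch of the loan lifecycle (editor's illustration; the producer
 * function is hypothetical, only the arc_* calls are real):
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);  reference held by "onloan"
 *	fill_from_wire(buf->b_data, size);         hypothetical producer
 *	arc_return_buf(buf, tag);                  reference moves to tag
 *
 * While on loan, the bytes are tracked in arc_loaned_bytes and excluded
 * from arc_tempreserve_space()'s in-flight dirty-data accounting.
 */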
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
+ ASSERT(hdr->b_state != arc_anon);
+
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
* must verify b_data != NULL to know if the add_ref
* was successful.
*/
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
if (buf->b_data == NULL) {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return;
}
- hdr = buf->b_hdr;
- ASSERT(hdr != NULL);
- hash_lock = HDR_LOCK(hdr);
+ hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
- rw_exit(&buf->b_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ mutex_exit(&buf->b_evict_lock);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag);
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf);
+
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf->b_hdr, zio_buf_free,
buf->b_data, size);
- arc_space_return(size);
+ arc_space_return(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
arc_buf_data_free(buf->b_hdr,
zio_data_buf_free, buf->b_data, size);
+ ARCSTAT_INCR(arcstat_data_size, -size);
atomic_add_64(&arc_size, -size);
}
}
for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
continue;
*bufp = buf->b_next;
+ buf->b_next = NULL;
ASSERT(buf->b_efunc == NULL);
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
+ l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!(hdr->b_flags & ARC_STORED));
- if (hdr->b_l2hdr != NULL) {
- if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
- /*
- * To prevent arc_free() and l2arc_evict() from
- * attempting to free the same buffer at the same time,
- * a FREE_IN_PROGRESS flag is given to arc_free() to
- * give it priority. l2arc_evict() can't destroy this
- * header while we are waiting on l2arc_buflist_mtx.
- *
- * The hdr may be removed from l2ad_buflist before we
- * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
- */
+ if (l2hdr != NULL) {
+ boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ *
+ * The hdr may be removed from l2ad_buflist before we
+ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ */
+ if (!buflist_held) {
mutex_enter(&l2arc_buflist_mtx);
- if (hdr->b_l2hdr != NULL) {
- list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
- hdr);
- }
- mutex_exit(&l2arc_buflist_mtx);
- } else {
- list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ l2hdr = hdr->b_l2hdr;
}
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
- if (hdr->b_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
- hdr->b_l2hdr = NULL;
+
+ if (l2hdr != NULL) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
+
+ if (!buflist_held)
+ mutex_exit(&l2arc_buflist_mtx);
}
if (!BUF_EMPTY(hdr)) {
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
}
while (hdr->b_buf) {
arc_buf_t *buf = hdr->b_buf;
if (buf->b_efunc) {
mutex_enter(&arc_eviction_mtx);
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
ASSERT(buf->b_hdr != NULL);
arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
hdr->b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
mutex_exit(&arc_eviction_mtx);
} else {
arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
+ if (hdr->b_thawed) {
+ kmem_free(hdr->b_thawed, 1);
+ hdr->b_thawed = NULL;
+ }
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
kmutex_t *hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1)
+ if (hdr->b_datacnt > 1) {
arc_buf_destroy(buf, FALSE, TRUE);
- else
+ } else {
+ ASSERT(buf == hdr->b_buf);
+ ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
mutex_exit(hash_lock);
} else if (HDR_IO_IN_PROGRESS(hdr)) {
int destroy_hdr;
if (destroy_hdr)
arc_hdr_destroy(hdr);
} else {
- if (remove_reference(hdr, NULL, tag) > 0) {
- ASSERT(HDR_IO_ERROR(hdr));
+ if (remove_reference(hdr, NULL, tag) > 0)
arc_buf_destroy(buf, FALSE, TRUE);
- } else {
+ else
arc_hdr_destroy(hdr);
- }
}
}
int no_callback = (buf->b_efunc == NULL);
if (hdr->b_state == arc_anon) {
+ ASSERT(hdr->b_datacnt == 1);
arc_buf_free(buf, tag);
return (no_callback);
}
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT(hdr->b_state != arc_anon);
ASSERT(buf->b_data != NULL);
arc_buf_destroy(buf, FALSE, TRUE);
} else if (no_callback) {
ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_BUF_AVAILABLE;
}
ASSERT(no_callback || hdr->b_datacnt > 1 ||
* It may also return without evicting as much space as requested.
*/
static void *
-arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
+arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
arc_buf_contents_t type)
{
arc_state_t *evicted_state;
if (HDR_IO_IN_PROGRESS(ab) ||
(spa && ab->b_spa != spa) ||
(ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
- lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+ ddi_get_lbolt() - ab->b_arc_access <
+ arc_min_prefetch_lifespan)) {
skipped++;
continue;
}
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
- if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
missed += 1;
break;
}
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&arc_eviction_mtx);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
} else {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
arc_buf_destroy(buf,
buf->b_data == stolen, TRUE);
}
}
+
+ if (ab->b_l2hdr) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached,
+ ab->b_size);
+ } else {
+ if (l2arc_write_eligible(ab->b_spa, ab)) {
+ ARCSTAT_INCR(arcstat_evict_l2_eligible,
+ ab->b_size);
+ } else {
+ ARCSTAT_INCR(
+ arcstat_evict_l2_ineligible,
+ ab->b_size);
+ }
+ }
+
if (ab->b_datacnt == 0) {
arc_change_state(evicted_state, ab, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(ab));
mutex_exit(&state->arcs_mtx);
if (bytes_evicted < bytes)
- dprintf("only evicted %lld bytes from %x",
+ dprintf("only evicted %lld bytes from %x\n",
(longlong_t)bytes_evicted, state);
if (skipped)
if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
int64_t todelete =
MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
- arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+ arc_evict_ghost(arc_mru_ghost, 0, todelete);
} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
arc_mru_ghost->arcs_size +
arc_mfu_ghost->arcs_size - arc_c);
- arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+ arc_evict_ghost(arc_mfu_ghost, 0, todelete);
}
}
* bytes. Destroy the buffers that are removed.
*/
static void
-arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
+ arc_buf_hdr_t marker;
list_t *list = &state->arcs_list[ARC_BUFC_DATA];
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
ASSERT(GHOST_STATE(state));
+	bzero(&marker, sizeof (marker));
top:
mutex_enter(&state->arcs_mtx);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
if (spa && ab->b_spa != spa)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
hash_lock = HDR_LOCK(ab);
+ /* caller may be trying to modify this buffer, skip it */
+ if (MUTEX_HELD(hash_lock))
+ continue;
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
- } else {
- if (bytes < 0) {
- mutex_exit(&state->arcs_mtx);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
+ } else if (bytes < 0) {
+ /*
+ * Insert a list marker and then wait for the
+ * hash lock to become available. Once its
+ * available, restart from where we left off.
+ */
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&state->arcs_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ mutex_enter(&state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ } else
bufs_skipped += 1;
- }
}
mutex_exit(&state->arcs_mtx);
}
if (bytes_deleted < bytes)
- dprintf("only deleted %lld bytes from %p",
+ dprintf("only deleted %lld bytes from %p\n",
(longlong_t)bytes_deleted, state);
}
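/*
 * Sketch of the marker pattern above (editor's note): the zeroed marker
 * has b_spa == 0, so every scan skips it.  That lets this thread drop
 * arcs_mtx, block on the contended hash lock, and then resume from
 * list_prev(list, &marker) rather than restarting from the list tail.
 */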
static void
arc_adjust(void)
{
- int64_t top_sz, mru_over, arc_over, todelete;
+ int64_t adjustment, delta;
- top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
+ /*
+ * Adjust MRU size
+ */
+
+ adjustment = MIN((int64_t)(arc_size - arc_c),
+ (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
+ arc_p));
- if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
- int64_t toevict =
- MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
- (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
- top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
+ (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
+ adjustment -= delta;
}
- if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- int64_t toevict =
- MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
- (void) arc_evict(arc_mru, NULL, toevict, FALSE,
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ (void) arc_evict(arc_mru, 0, delta, FALSE,
ARC_BUFC_METADATA);
- top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
}
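/*
 * Illustrative numbers (editor's sketch): with arc_c = 1000,
 * arc_size = 1100, arc_p = 400 and anon + MRU + metadata = 600, the
 * MRU pass targets MIN(1100 - 1000, 600 - 400) = 100 bytes, evicting
 * data buffers first and metadata only for the remainder.
 */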
- mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
+ /*
+ * Adjust MFU size
+ */
- if (mru_over > 0) {
- if (arc_mru_ghost->arcs_size > 0) {
- todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
- arc_evict_ghost(arc_mru_ghost, NULL, todelete);
- }
+ adjustment = arc_size - arc_c;
+
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
+ delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
+ (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
+ adjustment -= delta;
}
- if ((arc_over = arc_size - arc_c) > 0) {
- int64_t tbl_over;
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ int64_t delta = MIN(adjustment,
+ arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
+ (void) arc_evict(arc_mfu, 0, delta, FALSE,
+ ARC_BUFC_METADATA);
+ }
- if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
- int64_t toevict =
- MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
- (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
- ARC_BUFC_DATA);
- arc_over = arc_size - arc_c;
- }
+ /*
+ * Adjust ghost lists
+ */
- if (arc_over > 0 &&
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- int64_t toevict =
- MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
- arc_over);
- (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
- ARC_BUFC_METADATA);
- }
+ adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
- tbl_over = arc_size + arc_mru_ghost->arcs_size +
- arc_mfu_ghost->arcs_size - arc_c * 2;
+ if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
+ delta = MIN(arc_mru_ghost->arcs_size, adjustment);
+ arc_evict_ghost(arc_mru_ghost, 0, delta);
+ }
- if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
- todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
- arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
- }
+ adjustment =
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+
+ if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
+ delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
+ arc_evict_ghost(arc_mfu_ghost, 0, delta);
}
}
while (arc_eviction_list != NULL) {
arc_buf_t *buf = arc_eviction_list;
arc_eviction_list = buf->b_next;
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
buf->b_hdr = NULL;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
mutex_exit(&arc_eviction_mtx);
if (buf->b_efunc != NULL)
void
arc_flush(spa_t *spa)
{
+ uint64_t guid = 0;
+
+ if (spa)
+ guid = spa_guid(spa);
+
while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
- (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
+ (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
if (spa)
break;
}
while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
- (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
+ (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
if (spa)
break;
}
while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
- (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
+ (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
if (spa)
break;
}
while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
- (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
+ (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
if (spa)
break;
}
- arc_evict_ghost(arc_mru_ghost, spa, -1);
- arc_evict_ghost(arc_mfu_ghost, spa, -1);
+ arc_evict_ghost(arc_mru_ghost, guid, -1);
+ arc_evict_ghost(arc_mfu_ghost, guid, -1);
mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts();
ASSERT(spa || arc_eviction_list == NULL);
}
-int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
-
void
arc_shrink(void)
{
static int
arc_reclaim_needed(void)
{
- uint64_t extra;
-
#ifdef _KERNEL
+ uint64_t extra;
if (needfree)
return (1);
}
/* reset the growth delay for every reclaim */
- growtime = lbolt + (arc_grow_retry * hz);
+ growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
arc_kmem_reap_now(last_reclaim);
arc_warm = B_TRUE;
- } else if (arc_no_grow && lbolt >= growtime) {
+ } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
arc_no_grow = FALSE;
}
- if (2 * arc_c < arc_size +
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
- arc_adjust();
+ arc_adjust();
if (arc_eviction_list != NULL)
arc_do_user_evicts();
/* block until needed, or one second, whichever is shorter */
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&arc_reclaim_thr_cv,
- &arc_reclaim_thr_lock, (lbolt + hz));
+ &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
}
arc_adapt(int bytes, arc_state_t *state)
{
int mult;
+ uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
if (state == arc_l2c_only)
return;
if (state == arc_mru_ghost) {
mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
- arc_p = MIN(arc_c, arc_p + bytes * mult);
+ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) {
+ uint64_t delta;
+
mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+ mult = MIN(mult, 10);
- arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
+ delta = MIN(bytes * mult, arc_p);
+ arc_p = MAX(arc_p_min, arc_p - delta);
}
ASSERT((int64_t)arc_p >= 0);
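/*
 * Example of the clamped adaptation (editor's sketch): a ghost-list
 * ratio of 25:1 now yields mult = MIN(25, 10) = 10, so one hit moves
 * arc_p by at most 10 * bytes, and the MIN/MAX bounds pin arc_p inside
 * [arc_p_min, arc_c - arc_p_min] instead of letting it reach 0 or arc_c.
 */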
if (!arc_evict_needed(type)) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size);
+ arc_space_consume(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
+ ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
goto out;
if (state == arc_mru || state == arc_anon) {
uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
- state = (arc_mfu->arcs_lsize[type] > 0 &&
+ state = (arc_mfu->arcs_lsize[type] >= size &&
arc_p > mru_used) ? arc_mfu : arc_mru;
} else {
/* MFU cases */
uint64_t mfu_space = arc_c - arc_p;
- state = (arc_mru->arcs_lsize[type] > 0 &&
+ state = (arc_mru->arcs_lsize[type] >= size &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
- if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
+ if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size);
+ arc_space_consume(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
+ ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
ARCSTAT_BUMP(arcstat_recycle_miss);
static void
arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
+ clock_t now;
+
ASSERT(MUTEX_HELD(hash_lock));
if (buf->b_state == arc_anon) {
*/
ASSERT(buf->b_arc_access == 0);
- buf->b_arc_access = lbolt;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
arc_change_state(arc_mru, buf, hash_lock);
} else if (buf->b_state == arc_mru) {
+ now = ddi_get_lbolt();
+
/*
* If this buffer is here because of a prefetch, then either:
* - clear the flag if this is a "referencing" read
buf->b_flags &= ~ARC_PREFETCH;
ARCSTAT_BUMP(arcstat_mru_hits);
}
- buf->b_arc_access = lbolt;
+ buf->b_arc_access = now;
return;
}
* but it is still in the cache. Move it to the MFU
* state.
*/
- if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+ if (now > buf->b_arc_access + ARC_MINTIME) {
/*
* More than 125ms have passed since we
* instantiated this buffer. Move it to the
* most frequently used state.
*/
- buf->b_arc_access = lbolt;
+ buf->b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
}
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
}
- buf->b_arc_access = lbolt;
+ buf->b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, buf, hash_lock);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
ASSERT(list_link_active(&buf->b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
- buf->b_arc_access = lbolt;
+ buf->b_arc_access = ddi_get_lbolt();
} else if (buf->b_state == arc_mfu_ghost) {
arc_state_t *new_state = arc_mfu;
/*
new_state = arc_mru;
}
- buf->b_arc_access = lbolt;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(new_state, buf, hash_lock);
* This buffer is on the 2nd Level ARC.
*/
- buf->b_arc_access = lbolt;
+ buf->b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
} else {
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
- bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+ if (zio == NULL || zio->io_error == 0)
+ bcopy(buf->b_data, arg, buf->b_hdr->b_size);
VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
*bufp = NULL;
} else {
*bufp = buf;
+ ASSERT(buf->b_data);
}
}
* reason for it not to be found is if we were freed during the
* read.
*/
- found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+ found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
&hash_lock);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
/* byteswap if necessary */
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
- if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+ if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
byteswap_uint64_array :
dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
arc_cksum_compute(buf, B_FALSE);
+ if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ arc_access(hdr, hash_lock);
+ }
+
/* create copies of the data buffer for the callers */
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
hdr->b_acb = NULL;
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
ASSERT(!HDR_BUF_AVAILABLE(hdr));
- if (abuf == buf)
+ if (abuf == buf) {
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(hdr->b_datacnt == 1);
hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
cv_broadcast(&hdr->b_cv);
if (hash_lock) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- if (zio->io_error == 0 && hdr->b_state == arc_anon)
- arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else {
/*
* arc_read_bp.
*/
int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb)
{
int err;
- arc_buf_hdr_t *hdr = pbuf->b_hdr;
+
+ if (pbuf == NULL) {
+ /*
+ * XXX This happens from traverse callback funcs, for
+ * the objset_phys_t block.
+ */
+ return (arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb));
+ }
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
- rw_enter(&pbuf->b_lock, RW_READER);
+ rw_enter(&pbuf->b_data_lock, RW_READER);
err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb);
+ rw_exit(&pbuf->b_data_lock);
- ASSERT3P(hdr, ==, pbuf->b_hdr);
- rw_exit(&pbuf->b_lock);
return (err);
}
int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
- arc_buf_t *buf;
+ arc_buf_t *buf = NULL;
kmutex_t *hash_lock;
zio_t *rzio;
+ uint64_t guid = spa_guid(spa);
top:
- hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+ &hash_lock);
if (hdr && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED;
acb->acb_private = private;
if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio,
- spa, NULL, NULL, zio_flags);
+ spa, NULL, NULL, NULL, zio_flags);
ASSERT(acb->acb_done != NULL);
acb->acb_next = hdr->b_acb;
} else {
buf = arc_buf_clone(buf);
}
+
} else if (*arc_flags & ARC_PREFETCH &&
refcount_count(&hdr->b_refcnt) == 0) {
hdr->b_flags |= ARC_PREFETCH;
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
vdev_t *vd = NULL;
- daddr_t addr;
+ daddr_t addr = -1;
+ boolean_t devw = B_FALSE;
if (hdr == NULL) {
/* this block is not in the cache */
buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr;
hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = bp->blk_birth;
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
if (exists) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
(void) arc_buf_remove_ref(buf, private);
goto top; /* restart the IO request */
}
buf->b_private = NULL;
buf->b_next = NULL;
hdr->b_buf = buf;
- arc_get_data_buf(buf);
ASSERT(hdr->b_datacnt == 0);
hdr->b_datacnt = 1;
-
+ arc_get_data_buf(buf);
+ arc_access(hdr, hash_lock);
}
+ ASSERT(!GHOST_STATE(hdr->b_state));
+
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
hdr->b_acb = acb;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
- /*
- * If the buffer has been evicted, migrate it to a present state
- * before issuing the I/O. Once we drop the hash-table lock,
- * the header will be marked as I/O in progress and have an
- * attached buffer. At this point, anybody who finds this
- * buffer ought to notice that it's legit but has a pending I/O.
- */
-
- if (GHOST_STATE(hdr->b_state))
- arc_access(hdr, hash_lock);
-
if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
(vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr->b_dev->l2ad_writing;
addr = hdr->b_l2hdr->b_daddr;
/*
* Lock out device removal.
mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
- DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
- zbookmark_t *, zb);
+ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
+ uint64_t, size, zbookmark_t *, zb);
ARCSTAT_BUMP(arcstat_misses);
ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
data, metadata, misses);
- if (vd != NULL) {
+ if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/*
* Read from the L2ARC if the following are true:
* 1. The L2ARC vdev was previously cached.
* 3. This buffer isn't currently writing to the L2ARC.
* 4. The L2ARC entry wasn't evicted, which may
* also have invalidated the vdev.
+ * 5. This isn't prefetch and l2arc_noprefetch is set.
*/
if (hdr->b_l2hdr != NULL &&
- !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
+ !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
l2arc_read_callback_t *cb;
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ZIO_FLAG_DONT_RETRY, B_FALSE);
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
+ ARCSTAT_INCR(arcstat_l2_read_bytes, size);
if (*arc_flags & ARC_NOWAIT) {
zio_nowait(rzio);
ARCSTAT_BUMP(arcstat_l2_rw_clash);
spa_config_exit(spa, SCL_L2ARC, vd);
}
+ } else {
+ if (vd != NULL)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ if (l2arc_ndev != 0) {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ }
}
rzio = zio_read(pio, spa, bp, buf->b_data, size,
return (0);
}
-/*
- * arc_read() variant to support pool traversal. If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_mtx;
- int rc = 0;
-
- hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
- if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
- arc_buf_t *buf = hdr->b_buf;
-
- ASSERT(buf);
- while (buf->b_data == NULL) {
- buf = buf->b_next;
- ASSERT(buf);
- }
- bcopy(buf->b_data, data, hdr->b_size);
- } else {
- rc = ENOENT;
- }
-
- if (hash_mtx)
- mutex_exit(hash_mtx);
-
- return (rc);
-}
-
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
ASSERT(buf->b_hdr != NULL);
ASSERT(buf->b_hdr->b_state != arc_anon);
ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
buf->b_efunc = func;
buf->b_private = private;
}
kmutex_t *hash_lock;
arc_buf_t **bufp;
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
if (hdr == NULL) {
/*
* We are in arc_do_user_evicts().
*/
ASSERT(buf->b_data == NULL);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (0);
} else if (buf->b_data == NULL) {
arc_buf_t copy = *buf; /* structure assignment */
* but let arc_do_user_evicts() do the reaping.
*/
buf->b_efunc = NULL;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
	VERIFY(copy.b_efunc(&copy) == 0);
return (1);
}
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT(buf->b_hdr == hdr);
ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
arc_state_t *old_state = hdr->b_state;
arc_state_t *evicted_state;
+ ASSERT(hdr->b_buf == NULL);
ASSERT(refcount_is_zero(&hdr->b_refcnt));
evicted_state =
mutex_exit(&old_state->arcs_mtx);
}
mutex_exit(hash_lock);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
VERIFY(buf->b_efunc(buf) == 0);
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_hdr = NULL;
+ buf->b_next = NULL;
kmem_cache_free(buf_cache, buf);
return (1);
}
arc_release(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
+ kmutex_t *hash_lock = NULL;
l2arc_buf_hdr_t *l2hdr;
- uint64_t buf_size;
+ uint64_t buf_size = 0;
+
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
/* this buffer is not on any list */
ASSERT(refcount_count(&hdr->b_refcnt) > 0);
- ASSERT(!(hdr->b_flags & ARC_STORED));
if (hdr->b_state == arc_anon) {
/* this buffer is already released */
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
- ASSERT(BUF_EMPTY(hdr));
ASSERT(buf->b_efunc == NULL);
- arc_buf_thaw(buf);
- rw_exit(&buf->b_lock);
- return;
+ } else {
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
}
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
-
l2hdr = hdr->b_l2hdr;
if (l2hdr) {
mutex_enter(&l2arc_buflist_mtx);
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
- spa_t *spa = hdr->b_spa;
+ uint64_t spa = hdr->b_spa;
arc_buf_contents_t type = hdr->b_type;
uint32_t flags = hdr->b_flags;
ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
/*
- * Pull the data off of this buf and attach it to
- * a new anonymous buf.
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr.
*/
(void) remove_reference(hdr, hash_lock, tag);
bufp = &hdr->b_buf;
while (*bufp != buf)
bufp = &(*bufp)->b_next;
- *bufp = (*bufp)->b_next;
+ *bufp = buf->b_next;
buf->b_next = NULL;
ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
nhdr->b_freeze_cksum = NULL;
(void) refcount_add(&nhdr->b_refcnt, tag);
buf->b_hdr = nhdr;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
} else {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- arc_change_state(arc_anon, hdr, hash_lock);
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
- mutex_exit(hash_lock);
+ if (hash_lock)
+ mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
buf->b_efunc = NULL;
}
}
+/*
+ * Release this buffer. If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb)
+{
+ arc_release(buf, tag);
+ return (0);
+}
+
int
arc_released(arc_buf_t *buf)
{
int released;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (released);
}
{
int callback;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
callback = (buf->b_efunc != NULL);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (callback);
}
{
int referenced;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
referenced = (refcount_count(&buf->b_hdr->b_refcnt));
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (referenced);
}
#endif
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- hdr->b_acb = NULL;
+ ASSERT(hdr->b_acb == NULL);
+
+ if (zio->io_error == 0) {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ } else {
+ ASSERT(BUF_EMPTY(hdr));
+ }
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = zio->io_bp->blk_birth;
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
/*
* If the block to be written was all-zero, we may have
* compressed it away. In this case no write was performed
- * so there will be no dva/birth-date/checksum. The buffer
- * must therefor remain anonymous (and uncached).
+ * so there will be no dva/birth/checksum. The buffer must
+ * therefore remain anonymous (and uncached).
*/
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
+ ASSERT(zio->io_error == 0);
+
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
* sync-to-convergence, because we remove
* buffers from the hash table when we arc_free().
*/
- ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
- ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
- BP_IDENTITY(zio->io_bp)));
- ASSERT3U(zio->io_bp_orig.blk_birth, ==,
- zio->io_bp->blk_birth);
-
- ASSERT(refcount_is_zero(&exists->b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(exists);
- exists = buf_hash_insert(hdr, &hash_lock);
- ASSERT3P(exists, ==, NULL);
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad overwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ } else {
+ /* Dedup */
+ ASSERT(hdr->b_datacnt == 1);
+ ASSERT(hdr->b_state == arc_anon);
+ ASSERT(BP_GET_DEDUP(zio->io_bp));
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ }
}
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
/* if it's not anon, we are doing a scrub */
- if (hdr->b_state == arc_anon)
+ if (!exists && hdr->b_state == arc_anon)
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
- } else if (callback->awcb_done == NULL) {
- int destroy_hdr;
- /*
- * This is an anonymous buffer with no user callback,
- * destroy it if there are no active references.
- */
- mutex_enter(&arc_eviction_mtx);
- destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- mutex_exit(&arc_eviction_mtx);
- if (destroy_hdr)
- arc_hdr_destroy(hdr);
} else {
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
- hdr->b_flags &= ~ARC_STORED;
- if (callback->awcb_done) {
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
- callback->awcb_done(zio, buf, callback->awcb_private);
- }
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
kmem_free(callback, sizeof (arc_write_callback_t));
}
-void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
- boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
- /* Determine checksum setting */
- if (ismd) {
- /*
- * Metadata always gets checksummed. If the data
- * checksum is multi-bit correctable, and it's not a
- * ZBT-style checksum, then it's suitable for metadata
- * as well. Otherwise, the metadata checksum defaults
- * to fletcher4.
- */
- if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
- !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
- zp->zp_checksum = wp->wp_oschecksum;
- else
- zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
- } else {
- zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
- wp->wp_oschecksum);
- }
-
- /* Determine compression setting */
- if (ismd) {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
- ZIO_COMPRESS_LZJB;
- } else {
- zp->zp_compress = zio_compress_select(wp->wp_dncompress,
- wp->wp_oscompress);
- }
-
- zp->zp_type = wp->wp_type;
- zp->zp_level = wp->wp_level;
- zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
- boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
zio_t *zio;
- zio_prop_t zp;
ASSERT(ready != NULL);
+ ASSERT(done != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == 0);
+ ASSERT(hdr->b_acb == NULL);
if (l2arc)
hdr->b_flags |= ARC_L2CACHE;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_private = private;
callback->awcb_buf = buf;
- write_policy(spa, wp, &zp);
- zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
- arc_buf_hdr_t *ab;
- kmutex_t *hash_lock;
- zio_t *zio;
-
- /*
- * If this buffer is in the cache, release it, so it
- * can be re-used.
- */
- ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (ab != NULL) {
- /*
- * The checksum of blocks to free is not always
- * preserved (eg. on the deadlist). However, if it is
- * nonzero, it should match what we have in the cache.
- */
- ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
- bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
- if (ab->b_state != arc_anon)
- arc_change_state(arc_anon, ab, hash_lock);
- if (HDR_IO_IN_PROGRESS(ab)) {
- /*
- * This should only happen when we prefetch.
- */
- ASSERT(ab->b_flags & ARC_PREFETCH);
- ASSERT3U(ab->b_datacnt, ==, 1);
- ab->b_flags |= ARC_FREED_IN_READ;
- if (HDR_IN_HASH_TABLE(ab))
- buf_hash_remove(ab);
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- } else if (refcount_is_zero(&ab->b_refcnt)) {
- ab->b_flags |= ARC_FREE_IN_PROGRESS;
- mutex_exit(hash_lock);
- arc_hdr_destroy(ab);
- ARCSTAT_BUMP(arcstat_deleted);
- } else {
- /*
- * We still have an active reference on this
- * buffer. This can happen, e.g., from
- * dbuf_unoverride().
- */
- ASSERT(!HDR_IN_HASH_TABLE(ab));
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- }
- }
-
- zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
- if (arc_flags & ARC_WAIT)
- return (zio_wait(zio));
-
- ASSERT(arc_flags & ARC_NOWAIT);
- zio_nowait(zio);
-
- return (0);
-}
-
static int
-arc_memory_throttle(uint64_t reserve, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t inflight_data = arc_anon->arcs_size;
uint64_t available_memory = ptob(freemem);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
int error;
+ uint64_t anon_size;
#ifdef ZFS_DEBUG
/*
if (reserve > arc_c)
return (ENOMEM);
+ /*
+ * Don't count loaned bufs as in flight dirty data to prevent long
+ * network delays from blocking transactions that are ready to be
+ * assigned to a txg.
+ */
+ anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+
/*
* Writes will, almost always, require additional memory allocations
 * in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- if (error = arc_memory_throttle(reserve, txg))
+ if ((error = arc_memory_throttle(reserve, anon_size, txg)))
return (error);
/*
* Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal.
*/
- if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
- arc_anon->arcs_size > arc_c / 4) {
+
+ if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
+ anon_size > arc_c / 4) {
dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
"anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
arc_tempreserve>>10,
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
arc_c_min = arc_meta_limit / 2;
+ if (zfs_arc_grow_retry > 0)
+ arc_grow_retry = zfs_arc_grow_retry;
+
+ if (zfs_arc_shrink_shift > 0)
+ arc_shrink_shift = zfs_arc_shrink_shift;
+
+ if (zfs_arc_p_min_shift > 0)
+ arc_p_min_shift = zfs_arc_p_min_shift;
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
mutex_destroy(&arc_mru_ghost->arcs_mtx);
mutex_destroy(&arc_mfu->arcs_mtx);
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+ mutex_destroy(&arc_l2c_only->arcs_mtx);
mutex_destroy(&zfs_write_limit_lock);
buf_fini();
+
+ ASSERT(arc_loaned_bytes == 0);
}
/*
*
* Tunables may be removed or added as future performance improvements are
* integrated, and also may become zpool properties.
+ *
+ * There are three key functions that control how the L2ARC warms up:
+ *
+ * l2arc_write_eligible() check if a buffer is eligible to cache
+ * l2arc_write_size() calculate how much to write
+ * l2arc_write_interval() calculate sleep delay between writes
+ *
+ * These three functions determine what to write, how much, and how quickly
+ * to send writes.
*/
+static boolean_t
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
+{
+ /*
+ * A buffer is *not* eligible for the L2ARC if it:
+ * 1. belongs to a different spa.
+ * 2. is already cached on the L2ARC.
+ * 3. has an I/O in progress (it may be an incomplete read).
+ * 4. is flagged not eligible (zfs property).
+ */
+ if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
+ HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static uint64_t
+l2arc_write_size(l2arc_dev_t *dev)
+{
+ uint64_t size;
+
+ size = dev->l2ad_write;
+
+ if (arc_warm == B_FALSE)
+ size += dev->l2ad_boost;
+
+ return (size);
+}
+
+static clock_t
+l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
+{
+ clock_t interval, next, now;
+
+ /*
+ * If the ARC lists are busy, increase our write rate; if the
+ * lists are stale, idle back. This is achieved by checking
+ * how much we previously wrote - if it was more than half of
+ * what we wanted, schedule the next write much sooner.
+ */
+ if (l2arc_feed_again && wrote > (wanted / 2))
+ interval = (hz * l2arc_feed_min_ms) / 1000;
+ else
+ interval = hz * l2arc_feed_secs;
+
+ now = ddi_get_lbolt();
+ next = MAX(now, MIN(now + interval, began + interval));
+
+ return (next);
+}
+
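To make the pacing concrete, here is a minimal user-space C sketch of the interval logic above. It is an illustration, not part of the patch: write_interval_demo and the _DEMO constants are hypothetical stand-ins, and the values hz = 1000, l2arc_feed_secs = 1, and l2arc_feed_min_ms = 200 are assumed defaults.

#include <stdio.h>

/* Assumed stand-ins for the kernel clock rate and L2ARC tunables. */
#define	HZ_DEMO			1000	/* clock ticks per second */
#define	FEED_SECS_DEMO		1	/* l2arc_feed_secs */
#define	FEED_MIN_MS_DEMO	200	/* l2arc_feed_min_ms */

/* Mirrors the l2arc_write_interval() logic above. */
static long
write_interval_demo(long began, long now,
    unsigned long long wanted, unsigned long long wrote)
{
	long interval, next;

	if (wrote > (wanted / 2))
		interval = (HZ_DEMO * FEED_MIN_MS_DEMO) / 1000;
	else
		interval = HZ_DEMO * FEED_SECS_DEMO;

	next = now + interval;		/* MIN(now + interval, ...) */
	if (began + interval < next)
		next = began + interval;
	if (next < now)			/* MAX(now, ...) */
		next = now;
	return (next);
}

int
main(void)
{
	/* Busy: wrote more than half of an 8 MiB target. */
	printf("busy:  next write at tick %ld\n",
	    write_interval_demo(0, 50, 8 << 20, 6 << 20));
	/* Stale: wrote only a small fraction of the target. */
	printf("stale: next write at tick %ld\n",
	    write_interval_demo(0, 50, 8 << 20, 1 << 20));
	return (0);
}

With these inputs the busy case schedules the next write at tick 200, measured from when the pass began, while the stale case waits until tick 1000.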
static void
l2arc_hdr_stat_add(void)
{
* Free buffers that were tagged for destruction.
*/
static void
-l2arc_do_free_on_write()
+l2arc_do_free_on_write(void)
{
list_t *buflist;
l2arc_data_free_t *df, *df_prev;
ASSERT(cb != NULL);
buf = cb->l2rcb_buf;
ASSERT(buf != NULL);
- hdr = buf->b_hdr;
- ASSERT(hdr != NULL);
- hash_lock = HDR_LOCK(hdr);
+ hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
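+	/*
+	 * Re-read b_hdr under the hash lock and verify it still maps to
+	 * the same lock, i.e. the header did not move while we blocked.
+	 */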
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
/*
* Check this survived the L2ARC journey.
* storage now. If there *is* a waiter, the caller must
* issue the i/o in a context where it's OK to block.
*/
- if (zio->io_waiter == NULL)
- zio_nowait(zio_read(zio->io_parent,
- cb->l2rcb_spa, &cb->l2rcb_bp,
+ if (zio->io_waiter == NULL) {
+ zio_t *pio = zio_unique_parent(zio);
+
+ ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
buf->b_data, zio->io_size, arc_read_done, buf,
zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+ }
}
kmem_free(cb, sizeof (l2arc_read_callback_t));
static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
- list_t *list;
+ list_t *list = NULL;
ASSERT(list_num >= 0 && list_num <= 3);
}
mutex_exit(&l2arc_buflist_mtx);
- spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+ vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
dev->l2ad_evict = taddr;
}
* An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
* for reading until they have completed writing.
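+ * The number of bytes actually written is returned so that the feed
+ * thread can pace the interval until the next write.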
*/
-static void
+static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
arc_buf_hdr_t *ab, *ab_prev, *head;
list_t *list;
uint64_t passed_sz, write_sz, buf_sz, headroom;
void *buf_data;
- kmutex_t *hash_lock, *list_lock;
+ kmutex_t *hash_lock, *list_lock = NULL;
boolean_t have_lock, full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
+ uint64_t guid = spa_guid(spa);
+ int try;
ASSERT(dev->l2ad_vdev != NULL);
* Copy buffers for L2ARC writing.
*/
mutex_enter(&l2arc_buflist_mtx);
- for (int try = 0; try <= 3; try++) {
+ for (try = 0; try <= 3; try++) {
list = l2arc_list_locked(try, &list_lock);
passed_sz = 0;
break;
}
- if (ab->b_spa != spa) {
- mutex_exit(hash_lock);
- continue;
- }
-
- if (ab->b_l2hdr != NULL) {
- /*
- * Already in L2ARC.
- */
- mutex_exit(hash_lock);
- continue;
- }
-
- if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
+ if (!l2arc_write_eligible(guid, ab)) {
mutex_exit(hash_lock);
continue;
}
break;
}
- if (ab->b_buf == NULL) {
- DTRACE_PROBE1(l2arc__buf__null, void *, ab);
- mutex_exit(hash_lock);
- continue;
- }
-
if (pio == NULL) {
/*
* Insert a dummy header on the buflist so
if (pio == NULL) {
ASSERT3U(write_sz, ==, 0);
kmem_cache_free(hdr_cache, head);
- return;
+ return (0);
}
ASSERT3U(write_sz, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
- spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+ vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- spa_l2cache_space_update(dev->l2ad_vdev, 0,
- dev->l2ad_end - dev->l2ad_hand);
+ vdev_space_update(dev->l2ad_vdev,
+ dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
}
+ dev->l2ad_writing = B_TRUE;
(void) zio_wait(pio);
+ dev->l2ad_writing = B_FALSE;
+
+ return (write_sz);
}
/*
callb_cpr_t cpr;
l2arc_dev_t *dev;
spa_t *spa;
- uint64_t size;
+ uint64_t size, wrote;
+ clock_t begin, next = ddi_get_lbolt();
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
mutex_enter(&l2arc_feed_thr_lock);
while (l2arc_thread_exit == 0) {
- /*
- * Pause for l2arc_feed_secs seconds between writes.
- */
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- lbolt + (hz * l2arc_feed_secs));
+ next);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+ next = ddi_get_lbolt() + hz;
/*
* Quick check for L2ARC devices.
continue;
}
mutex_exit(&l2arc_dev_mtx);
+ begin = ddi_get_lbolt();
/*
* This selects the next l2arc device to write to, and in
spa = dev->l2ad_spa;
ASSERT(spa != NULL);
+ /*
+ * If the pool is read-only then force the feed thread to
+ * sleep a little longer.
+ */
+ if (!spa_writeable(spa)) {
+ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
/*
* Avoid contributing to memory pressure.
*/
ARCSTAT_BUMP(arcstat_l2_feeds);
- size = dev->l2ad_write;
- if (arc_warm == B_FALSE)
- size += dev->l2ad_boost;
+ size = l2arc_write_size(dev);
/*
* Evict L2ARC buffers that will be overwritten.
/*
* Write ARC buffers.
*/
- l2arc_write_buffers(spa, dev, size);
+ wrote = l2arc_write_buffers(spa, dev, size);
+
+ /*
+ * Calculate interval between writes.
+ */
+ next = l2arc_write_interval(begin, size, wrote);
spa_config_exit(spa, SCL_L2ARC, dev);
}
* validated the vdev and opened it.
*/
void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
l2arc_dev_t *adddev;
adddev->l2ad_vdev = vd;
adddev->l2ad_write = l2arc_write_max;
adddev->l2ad_boost = l2arc_write_boost;
- adddev->l2ad_start = start;
- adddev->l2ad_end = end;
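+	/*
+	 * Derive the usable extent from the vdev itself: skip the front
+	 * labels, then span the vdev's minimum allocatable size.
+	 */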
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
+ adddev->l2ad_writing = B_FALSE;
+ list_link_init(&adddev->l2ad_node);
ASSERT3U(adddev->l2ad_write, >, 0);
/*
list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l2node));
- spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
/*
* Add device to global list
void
l2arc_start(void)
{
- if (!(spa_mode & FWRITE))
+ if (!(spa_mode_global & FWRITE))
return;
(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
void
l2arc_stop(void)
{
- if (!(spa_mode & FWRITE))
+ if (!(spa_mode_global & FWRITE))
return;
mutex_enter(&l2arc_feed_thr_lock);
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_remove_ref);
+EXPORT_SYMBOL(arc_getbuf_func);
+
+module_param(zfs_arc_min, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_min, "Minimum arc size");
+
+module_param(zfs_arc_max, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_max, "Maximum arc size");
+
+module_param(zfs_arc_meta_limit, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
+#endif
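For reference, a minimal user-space sketch (not part of the patch) that reads one of these parameters back through the sysfs path that module_param() creates; the module name "zfs" and the reading of 0 as "use the built-in default" are assumptions.

#include <stdio.h>

int
main(void)
{
	/*
	 * Standard sysfs location for module parameters (assumption:
	 * the module is loaded under the name "zfs").
	 */
	FILE *fp = fopen("/sys/module/zfs/parameters/zfs_arc_max", "r");
	unsigned long val;

	if (fp == NULL) {
		perror("fopen");
		return (1);
	}
	if (fscanf(fp, "%lu", &val) == 1)
		printf("zfs_arc_max = %lu bytes (0 = use default)\n", val);
	fclose(fp);
	return (0);
}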