Sequential scrub and resilvers

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index cd343b04e65dc61680e6bf9f303d19615087049f..6983576321d7d7e15fcd462862090a950b14218c 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -357,7 +357,8 @@ int                 arc_no_grow_shift = 5;
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
-static int             arc_min_prefetch_lifespan;
+static int             arc_min_prefetch_ms;
+static int             arc_min_prescient_prefetch_ms;
 
 /*
  * If this percent of memory is free, don't throttle.
@@ -407,7 +408,8 @@ unsigned long zfs_arc_dnode_limit_percent = 10;
  * These tunables are Linux specific
  */
 unsigned long zfs_arc_sys_free = 0;
-int zfs_arc_min_prefetch_lifespan = 0;
+int zfs_arc_min_prefetch_ms = 0;
+int zfs_arc_min_prescient_prefetch_ms = 0;
 int zfs_arc_p_aggressive_disable = 1;
 int zfs_arc_p_dampener_disable = 1;
 int zfs_arc_meta_prune = 10000;
@@ -663,6 +665,7 @@ typedef struct arc_stats {
        kstat_named_t arcstat_meta_min;
        kstat_named_t arcstat_sync_wait_for_async;
        kstat_named_t arcstat_demand_hit_predictive_prefetch;
+       kstat_named_t arcstat_demand_hit_prescient_prefetch;
        kstat_named_t arcstat_need_free;
        kstat_named_t arcstat_sys_free;
        kstat_named_t arcstat_raw_size;
@@ -762,6 +765,7 @@ static arc_stats_t arc_stats = {
        { "arc_meta_min",               KSTAT_DATA_UINT64 },
        { "sync_wait_for_async",        KSTAT_DATA_UINT64 },
        { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+       { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
        { "arc_need_free",              KSTAT_DATA_UINT64 },
        { "arc_sys_free",               KSTAT_DATA_UINT64 },
        { "arc_raw_size",               KSTAT_DATA_UINT64 }
@@ -861,6 +865,8 @@ static taskq_t *arc_prune_taskq;
 #define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define        HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define        HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define        HDR_PRESCIENT_PREFETCH(hdr)     \
+       ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define        HDR_COMPRESSION_ENABLED(hdr)    \
        ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
@@ -3778,6 +3784,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
        arc_state_t *evicted_state, *state;
        int64_t bytes_evicted = 0;
+       int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+           arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
 
        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3831,8 +3839,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
        /* prefetch buffers have a minimum lifespan */
        if (HDR_IO_IN_PROGRESS(hdr) ||
            ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
-           ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
-           arc_min_prefetch_lifespan)) {
+           ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
                ARCSTAT_BUMP(arcstat_evict_skip);
                return (bytes_evicted);
        }
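
For illustration, a minimal, self-contained sketch of the eviction-skip test in the hunk above. HZ, the default values, and the tick numbers are stand-in assumptions for the demo; the real check compares ddi_get_lbolt() against hdr->b_l1hdr.b_arc_access under the hash lock.

```c
/*
 * Simplified model of the arc_evict_hdr() lifespan check above, using
 * stand-in values instead of real ARC headers.  HZ plays the role of the
 * kernel's ticks-per-second constant; the two tunables select how long a
 * (prescient) prefetched buffer stays protected after its last access.
 */
#include <stdio.h>

#define	HZ	100				/* assumed ticks per second */

static int arc_min_prefetch_ms = 1;		/* defaults from arc_init() */
static int arc_min_prescient_prefetch_ms = 6;

/* Returns 1 if eviction should be skipped, mirroring the hunk's test. */
static int
evict_skip(int prescient, long now_ticks, long access_ticks)
{
	int min_lifetime = prescient ?
	    arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;

	return (now_ticks - access_ticks < (long)min_lifetime * HZ);
}

int
main(void)
{
	/* A prescient prefetch accessed 200 ticks ago is still protected. */
	printf("skip=%d\n", evict_skip(1, 500, 300));
	return (0);
}
```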
@@ -5492,13 +5499,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * - move the buffer to the head of the list if this is
                 *   another prefetch (to make it less likely to be evicted).
                 */
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
                                /* link protected by hash lock */
                                ASSERT(multilist_link_active(
                                    &hdr->b_l1hdr.b_arc_node));
                        } else {
-                               arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PREFETCH |
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
                                atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
                                ARCSTAT_BUMP(arcstat_mru_hits);
                        }
@@ -5532,10 +5541,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * MFU state.
                 */
 
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        new_state = arc_mru;
-                       if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
-                               arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+                       if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PREFETCH |
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
+                       }
                        DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
                } else {
                        new_state = arc_mfu;
@@ -5557,11 +5569,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * If it was a prefetch, we will explicitly move it to
                 * the head of the list now.
                 */
-               if ((HDR_PREFETCH(hdr)) != 0) {
-                       ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-                       /* link protected by hash_lock */
-                       ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-               }
+
                atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
                ARCSTAT_BUMP(arcstat_mfu_hits);
                hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
@@ -5573,12 +5581,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * MFU state.
                 */
 
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        /*
                         * This is a prefetch access...
                         * move this block back to the MRU state.
                         */
-                       ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
                        new_state = arc_mru;
                }
 
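
The two ghost-list hunks above share one decision: a ghost hit that is itself a prefetch (predictive or prescient) sends the block back to MRU, while a demand hit promotes it to MFU. A hedged, self-contained sketch of that decision follows; the flag values and the arc_state_t enum are stand-ins, not the real definitions from arc.h.

```c
/*
 * Minimal model of the ghost-list new_state choice in arc_access():
 * still-speculative buffers return to MRU, demand hits go to MFU.
 */
#include <stdio.h>

typedef enum { ARC_MRU, ARC_MFU } arc_state_t;

#define	ARC_FLAG_PREFETCH		(1 << 0)	/* stand-in bits */
#define	ARC_FLAG_PRESCIENT_PREFETCH	(1 << 1)

static arc_state_t
ghost_hit_new_state(unsigned int hdr_flags)
{
	if (hdr_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH))
		return (ARC_MRU);	/* still speculative: back to MRU */
	return (ARC_MFU);		/* demand access: promote to MFU */
}

int
main(void)
{
	printf("prescient->%d demand->%d\n",
	    ghost_hit_new_state(ARC_FLAG_PRESCIENT_PREFETCH),
	    ghost_hit_new_state(0));
	return (0);
}
```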
@@ -5605,20 +5612,25 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 /* a generic arc_read_done_func_t which you can use */
 /* ARGSUSED */
 void
-arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
-       if (error == 0)
-               bcopy(buf->b_data, arg, arc_buf_size(buf));
+       if (buf == NULL)
+               return;
+
+       bcopy(buf->b_data, arg, arc_buf_size(buf));
        arc_buf_destroy(buf, arg);
 }
 
 /* a generic arc_read_done_func_t */
+/* ARGSUSED */
 void
-arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
        arc_buf_t **bufp = arg;
-       if (error != 0) {
-               arc_buf_destroy(buf, arg);
+
+       if (buf == NULL) {
                *bufp = NULL;
        } else {
                *bufp = buf;
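
Both generic callbacks above now follow the widened arc_read_done_func_t signature and learn about failure from buf == NULL rather than a separate error argument. Below is a sketch of a caller-supplied callback written against that convention; the struct definitions are opaque stand-ins so the example compiles on its own, not the real ZFS types.

```c
/*
 * Sketch of a done callback matching the new five-argument signature
 * used by arc_bcopy_func()/arc_getbuf_func() above.  A failed read is
 * signalled by buf == NULL.
 */
#include <stddef.h>
#include <stdio.h>

typedef struct zio zio_t;			/* opaque stand-ins */
typedef struct zbookmark_phys zbookmark_phys_t;
typedef struct blkptr blkptr_t;
typedef struct arc_buf { void *b_data; } arc_buf_t;

static void
my_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
    arc_buf_t *buf, void *arg)
{
	int *outcome = arg;

	if (buf == NULL) {		/* read failed; nothing to consume */
		*outcome = -1;
		return;
	}
	*outcome = 0;			/* consume buf->b_data, then release it */
}

int
main(void)
{
	int outcome;

	my_read_done(NULL, NULL, NULL, NULL, &outcome);	/* failure path */
	printf("outcome=%d\n", outcome);
	return (0);
}
```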
@@ -5652,7 +5664,6 @@ arc_read_done(zio_t *zio)
        arc_callback_t  *callback_list;
        arc_callback_t  *acb;
        boolean_t       freeable = B_FALSE;
-       boolean_t       no_zio_error = (zio->io_error == 0);
 
        /*
         * The hdr was inserted into hash-table and removed from lists
@@ -5699,7 +5710,7 @@ arc_read_done(zio_t *zio)
                }
        }
 
-       if (no_zio_error) {
+       if (zio->io_error == 0) {
                /* byteswap if necessary */
                if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
                        if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -5720,7 +5731,8 @@ arc_read_done(zio_t *zio)
        callback_list = hdr->b_l1hdr.b_acb;
        ASSERT3P(callback_list, !=, NULL);
 
-       if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+       if (hash_lock && zio->io_error == 0 &&
+           hdr->b_l1hdr.b_state == arc_anon) {
                /*
                 * Only call arc_access on anonymous buffers.  This is because
                 * if we've issued an I/O for an evicted buffer, we've already
@@ -5741,13 +5753,19 @@ arc_read_done(zio_t *zio)
                if (!acb->acb_done)
                        continue;
 
-               /* This is a demand read since prefetches don't use callbacks */
                callback_cnt++;
 
+               if (zio->io_error != 0)
+                       continue;
+
                int error = arc_buf_alloc_impl(hdr, zio->io_spa,
                    acb->acb_dsobj, acb->acb_private, acb->acb_encrypted,
-                   acb->acb_compressed, acb->acb_noauth, no_zio_error,
+                   acb->acb_compressed, acb->acb_noauth, B_TRUE,
                    &acb->acb_buf);
+               if (error != 0) {
+                       arc_buf_destroy(acb->acb_buf, acb->acb_private);
+                       acb->acb_buf = NULL;
+               }
 
                /*
                 * Assert non-speculative zios didn't fail because an
@@ -5770,9 +5788,8 @@ arc_read_done(zio_t *zio)
                        }
                }
 
-               if (no_zio_error) {
+               if (zio->io_error == 0)
                        zio->io_error = error;
-               }
        }
        hdr->b_l1hdr.b_acb = NULL;
        arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
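
A reduced model of the callback loop in the hunk above: buffers are only materialized for callbacks when the zio succeeded, and the first per-callback allocation failure is folded back into zio->io_error, which in turn suppresses allocation for the remaining callbacks. alloc_buf_for() stands in for arc_buf_alloc_impl(), and the failing index and error value are made up for the demo.

```c
/* Loop-structure sketch only; no real ARC state is involved. */
#include <stdio.h>

static int
alloc_buf_for(int cb_index)
{
	return (cb_index == 1 ? 5 /* pretend EIO */ : 0);
}

static int
run_callbacks(int io_error, int ncallbacks)
{
	for (int i = 0; i < ncallbacks; i++) {
		if (io_error != 0)
			continue;	/* failed read: no buffer handed out */

		int error = alloc_buf_for(i);

		if (io_error == 0)
			io_error = error;	/* keep the first failure */
	}
	return (io_error);
}

int
main(void)
{
	printf("io_error=%d\n", run_callbacks(0, 3));
	return (0);
}
```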
@@ -5782,7 +5799,7 @@ arc_read_done(zio_t *zio)
        ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
            callback_list != NULL);
 
-       if (no_zio_error) {
+       if (zio->io_error == 0) {
                arc_hdr_verify(hdr, zio->io_bp);
        } else {
                arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -5816,8 +5833,8 @@ arc_read_done(zio_t *zio)
        /* execute each callback and free its structure */
        while ((acb = callback_list) != NULL) {
                if (acb->acb_done) {
-                       acb->acb_done(zio, zio->io_error, acb->acb_buf,
-                           acb->acb_private);
+                       acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+                           acb->acb_buf, acb->acb_private);
                }
 
                if (acb->acb_zio_dummy != NULL) {
@@ -5974,12 +5991,25 @@ top:
                                arc_hdr_clear_flags(hdr,
                                    ARC_FLAG_PREDICTIVE_PREFETCH);
                        }
+
+                       if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+                               ARCSTAT_BUMP(
+                                   arcstat_demand_hit_prescient_prefetch);
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
+                       }
+
                        ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
 
                        /* Get a buf with the desired data in it. */
                        rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset,
                            private, encrypted_read, compressed_read,
                            noauth_read, B_TRUE, &buf);
+                       if (rc != 0) {
+                               arc_buf_destroy(buf, private);
+                               buf = NULL;
+                       }
+
                        ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc == 0);
                } else if (*arc_flags & ARC_FLAG_PREFETCH &&
                    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
@@ -5987,6 +6017,8 @@ top:
                }
                DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
                arc_access(hdr, hash_lock);
+               if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+                       arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
                if (*arc_flags & ARC_FLAG_L2CACHE)
                        arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
                mutex_exit(hash_lock);
@@ -5996,7 +6028,7 @@ top:
                    data, metadata, hits);
 
                if (done)
-                       done(NULL, rc, buf, private);
+                       done(NULL, zb, bp, buf, private);
        } else {
                uint64_t lsize = BP_GET_LSIZE(bp);
                uint64_t psize = BP_GET_PSIZE(bp);
@@ -6112,6 +6144,8 @@ top:
                if (*arc_flags & ARC_FLAG_PREFETCH &&
                    refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
                        arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+               if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+                       arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
                if (*arc_flags & ARC_FLAG_L2CACHE)
                        arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
                if (BP_IS_AUTHENTICATED(bp))
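
On a cache miss, arc_read() copies the caller's hints from *arc_flags onto the new header, as the hunk above shows for the prescient-prefetch bit. A small stand-in sketch of that propagation follows; the flag values are illustrative, and the real PREFETCH case is additionally gated on the header being unreferenced, which is omitted here.

```c
/* Flag-propagation sketch for the arc_read() miss path. */
#include <stdio.h>

#define	ARC_FLAG_PREFETCH		(1 << 0)	/* stand-in bits */
#define	ARC_FLAG_PRESCIENT_PREFETCH	(1 << 1)
#define	ARC_FLAG_L2CACHE		(1 << 2)

static unsigned int
propagate_flags(unsigned int arc_flags, unsigned int hdr_flags)
{
	if (arc_flags & ARC_FLAG_PREFETCH)
		hdr_flags |= ARC_FLAG_PREFETCH;
	if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
		hdr_flags |= ARC_FLAG_PRESCIENT_PREFETCH;
	if (arc_flags & ARC_FLAG_L2CACHE)
		hdr_flags |= ARC_FLAG_L2CACHE;
	return (hdr_flags);
}

int
main(void)
{
	printf("hdr flags = 0x%x\n", propagate_flags(
	    ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_L2CACHE, 0));
	return (0);
}
```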
@@ -7223,9 +7257,15 @@ arc_tuning_update(void)
        if (zfs_arc_p_min_shift)
                arc_p_min_shift = zfs_arc_p_min_shift;
 
-       /* Valid range: 1 - N ticks */
-       if (zfs_arc_min_prefetch_lifespan)
-               arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
+       /* Valid range: 1 - N ms */
+       if (zfs_arc_min_prefetch_ms)
+               arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
+
+       /* Valid range: 1 - N ms */
+       if (zfs_arc_min_prescient_prefetch_ms) {
+               arc_min_prescient_prefetch_ms =
+                   zfs_arc_min_prescient_prefetch_ms;
+       }
 
        /* Valid range: 0 - 100 */
        if ((zfs_arc_lotsfree_percent >= 0) &&
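
The pattern in arc_tuning_update() above is the usual one for these knobs: a module parameter left at 0 means "keep the built-in default", and any non-zero value overrides it. A tiny self-contained model, with demo_-prefixed stand-ins for the real globals:

```c
/* Override-if-set sketch of the arc_tuning_update() pattern. */
#include <stdio.h>

static int demo_arc_min_prefetch_ms = 1;	/* built-in default */
static int demo_zfs_arc_min_prefetch_ms = 0;	/* user tunable, 0 = unset */

static void
demo_tuning_update(void)
{
	if (demo_zfs_arc_min_prefetch_ms)
		demo_arc_min_prefetch_ms = demo_zfs_arc_min_prefetch_ms;
}

int
main(void)
{
	demo_zfs_arc_min_prefetch_ms = 10;	/* pretend the admin set it */
	demo_tuning_update();
	printf("effective value: %d\n", demo_arc_min_prefetch_ms);
	return (0);
}
```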
@@ -7368,7 +7408,8 @@ arc_init(void)
        cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
 
        /* Convert seconds to clock ticks */
-       arc_min_prefetch_lifespan = 1 * hz;
+       arc_min_prefetch_ms = 1;
+       arc_min_prescient_prefetch_ms = 6;
 
 #ifdef _KERNEL
        /*
@@ -9006,8 +9047,12 @@ MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
 module_param(zfs_compressed_arc_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers");
 
-module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
-MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+module_param(zfs_arc_min_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms");
+
+module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms,
+       "Min life of prescient prefetched block in ms");
 
 module_param(l2arc_write_max, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
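
Since both new knobs are exported with module_param(..., 0644), they should be adjustable at runtime through the standard /sys/module/zfs/parameters/ interface once the module is loaded; that path is an assumption based on normal module_param behaviour, not something this diff shows. A minimal userspace sketch:

```c
/* Write a new value to the prescient-prefetch tunable via sysfs. */
#include <stdio.h>

int
main(void)
{
	const char *path =
	    "/sys/module/zfs/parameters/zfs_arc_min_prescient_prefetch_ms";
	FILE *fp = fopen(path, "w");

	if (fp == NULL) {
		perror(path);
		return (1);
	}
	fprintf(fp, "6\n");	/* matches the arc_init() default above */
	fclose(fp);
	return (0);
}
```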