]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Reset l2ad_hand and l2ad_first in l2arc_evict
authorGeorge Amanakis <gamanakis@gmail.com>
Tue, 31 Mar 2020 17:46:48 +0000 (13:46 -0400)
committerGitHub <noreply@github.com>
Tue, 31 Mar 2020 17:46:48 +0000 (10:46 -0700)
Increasing l2arc_write_size or l2arc_write_boost can result in
l2arc_write_buffers() not having enough space to perform its writes and
panic zio_write_phys().

Instead of resetting l2ad_hand to l2ad_start at the end of
l2arc_write_buffers() and not taking into account a possible
user-mediated increase of l2arc_write_max, we do this in l2arc_evict(),
right after l2arc_write_size() has run. If there is not enough space to
evict (ie we will exceed l2ad_end) we evict to the end of the device,
reset l2ad_hand to l2ad_start, set l2ad_first to 0 and iterate
l2arc_evict(). We avoid infinite iteration of l2arc_evict() by making
sure in l2arc_write_size() that l2ad_start + size does not exceed
l2ad_end.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #10154

module/zfs/arc.c
tests/runfiles/common.run
tests/zfs-tests/include/libtest.shlib
tests/zfs-tests/include/tunables.cfg
tests/zfs-tests/tests/functional/cache/Makefile.am
tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh [new file with mode: 0755]

index 8a0c1a4a7e2d4dffe998efe9e88e3b2b5e2f1789..9c5ee9829ca1b5194d0830cdd9d8ac642eafb594 100644 (file)
@@ -7467,9 +7467,9 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 }
 
 static uint64_t
-l2arc_write_size(void)
+l2arc_write_size(l2arc_dev_t *dev)
 {
-       uint64_t size;
+       uint64_t size, dev_size;
 
        /*
         * Make sure our globals have meaningful values in case the user
@@ -7486,6 +7486,23 @@ l2arc_write_size(void)
        if (arc_warm == B_FALSE)
                size += l2arc_write_boost;
 
+       /*
+        * Make sure the write size does not exceed the size of the cache
+        * device. This is important in l2arc_evict(), otherwise infinite
+        * iteration can occur.
+        */
+       dev_size = dev->l2ad_end - dev->l2ad_start;
+       if (size >= dev_size) {
+               cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
+                   "exceeds the size of the cache device (guid %llu), "
+                   "resetting them to the default (%d)",
+                   dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
+               size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
+
+               if (arc_warm == B_FALSE)
+                       size += l2arc_write_boost;
+       }
+
        return (size);
 
 }
@@ -8008,22 +8025,22 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
        arc_buf_hdr_t *hdr, *hdr_prev;
        kmutex_t *hash_lock;
        uint64_t taddr;
+       boolean_t rerun;
 
        buflist = &dev->l2ad_buflist;
 
-       if (!all && dev->l2ad_first) {
-               /*
-                * This is the first sweep through the device.  There is
-                * nothing to evict.
-                */
-               return;
-       }
-
-       if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
+top:
+       rerun = B_FALSE;
+       if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
                /*
-                * When nearing the end of the device, evict to the end
-                * before the device write hand jumps to the start.
+                * When there is no space to accomodate upcoming writes,
+                * evict to the end. Then bump the write hand to the start
+                * and iterate. This iteration does not happen indefinitely
+                * as we make sure in l2arc_write_size() that when l2ad_hand
+                * is reset, the write size does not exceed the end of the
+                * device.
                 */
+               rerun = B_TRUE;
                taddr = dev->l2ad_end;
        } else {
                taddr = dev->l2ad_hand + distance;
@@ -8031,7 +8048,15 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
        DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
            uint64_t, taddr, boolean_t, all);
 
-top:
+       if (!all && dev->l2ad_first) {
+               /*
+                * This is the first sweep through the device.  There is
+                * nothing to evict.
+                */
+               goto out;
+       }
+
+retry:
        mutex_enter(&dev->l2ad_mtx);
        for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
                hdr_prev = list_prev(buflist, hdr);
@@ -8052,7 +8077,7 @@ top:
                        mutex_exit(&dev->l2ad_mtx);
                        mutex_enter(hash_lock);
                        mutex_exit(hash_lock);
-                       goto top;
+                       goto retry;
                }
 
                /*
@@ -8101,6 +8126,17 @@ top:
                mutex_exit(hash_lock);
        }
        mutex_exit(&dev->l2ad_mtx);
+
+out:
+       if (rerun) {
+               /*
+                * Bump device hand to the device start if it is approaching the
+                * end. l2arc_evict() has already evicted ahead for this case.
+                */
+               dev->l2ad_hand = dev->l2ad_start;
+               dev->l2ad_first = B_FALSE;
+               goto top;
+       }
 }
 
 /*
@@ -8448,15 +8484,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
        ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
        ARCSTAT_INCR(arcstat_l2_psize, write_psize);
 
-       /*
-        * Bump device hand to the device start if it is approaching the end.
-        * l2arc_evict() will already have evicted ahead for this case.
-        */
-       if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
-               dev->l2ad_hand = dev->l2ad_start;
-               dev->l2ad_first = B_FALSE;
-       }
-
        dev->l2ad_writing = B_TRUE;
        (void) zio_wait(pio);
        dev->l2ad_writing = B_FALSE;
@@ -8539,7 +8566,7 @@ l2arc_feed_thread(void *unused)
 
                ARCSTAT_BUMP(arcstat_l2_feeds);
 
-               size = l2arc_write_size();
+               size = l2arc_write_size(dev);
 
                /*
                 * Evict L2ARC buffers that will be overwritten.
index 96700becbac733982fc2e6cd16e2fed1aa7b7be4..84ea70f07e3ad3a5bbcfd1f643eee1747d280bdc 100644 (file)
@@ -49,7 +49,7 @@ post =
 [tests/functional/cache]
 tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg',
     'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg',
-    'cache_009_pos', 'cache_010_neg', 'cache_011_pos']
+    'cache_009_pos', 'cache_010_neg', 'cache_011_pos', 'cache_012_pos']
 tags = ['functional', 'cache']
 
 [tests/functional/cachefile]
index 1f24b4271d85402017469325f5b72c8bfb9e092e..a641a0b7a7e3484e5e538bf89ad17e097c83b993 100644 (file)
@@ -4053,3 +4053,16 @@ function ls_xattr # path
                ;;
        esac
 }
+
+function get_arcstat # stat
+{
+       if is_linux; then
+               typeset stat=$1
+               typeset zfs_arcstats="/proc/spl/kstat/zfs/arcstats"
+               [[ -f "$zfs_arcstats" ]] || return 1
+               grep $stat $zfs_arcstats | awk '{print $3}'
+               return $?
+       else
+               return 1
+       fi
+}
index 01176c78112b143d2ca7ac1a46a99f8085b1156f..62d335abebc5becdcbb58d45671e79fd42f39037 100644 (file)
@@ -35,6 +35,9 @@ DISABLE_IVSET_GUID_CHECK      disable_ivset_guid_check        zfs_disable_ivset_guid_check
 INITIALIZE_CHUNK_SIZE          initialize_chunk_size           zfs_initialize_chunk_size
 INITIALIZE_VALUE               initialize_value                zfs_initialize_value
 KEEP_LOG_SPACEMAPS_AT_EXPORT   keep_log_spacemaps_at_export    zfs_keep_log_spacemaps_at_export
+L2ARC_NOPREFETCH               l2arc.noprefetch                l2arc_noprefetch
+L2ARC_WRITE_BOOST              l2arc.write_boost               l2arc_write_boost
+L2ARC_WRITE_MAX                        l2arc.write_max                 l2arc_write_max
 LIVELIST_CONDENSE_NEW_ALLOC    livelist.condense.new_alloc     zfs_livelist_condense_new_alloc
 LIVELIST_CONDENSE_SYNC_CANCEL  livelist.condense.sync_cancel   zfs_livelist_condense_sync_cancel
 LIVELIST_CONDENSE_SYNC_PAUSE   livelist.condense.sync_pause    zfs_livelist_condense_sync_pause
index 18dd9c1985ebde8728a87890a6de1f413f0465ab..406a9281709a4c04f10b64ca1a7847d611f8e4b0 100644 (file)
@@ -12,7 +12,8 @@ dist_pkgdata_SCRIPTS = \
        cache_008_neg.ksh \
        cache_009_pos.ksh \
        cache_010_neg.ksh \
-       cache_011_pos.ksh
+       cache_011_pos.ksh \
+       cache_012_pos.ksh
 
 dist_pkgdata_DATA = \
        cache.cfg \
diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh
new file mode 100755 (executable)
index 0000000..edefe9c
--- /dev/null
@@ -0,0 +1,110 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cache/cache.cfg
+. $STF_SUITE/tests/functional/cache/cache.kshlib
+
+#
+# DESCRIPTION:
+#      Looping around a cache device with l2arc_write_size exceeding
+#      the device size succeeds.
+#
+# STRATEGY:
+#      1. Create pool with a cache device.
+#      2. Set l2arc_write_max to a value larger than the cache device.
+#      3. Create a file larger than the cache device and random read
+#              for 10 sec.
+#      4. Verify that l2arc_write_max is set back to the default.
+#      5. Set l2arc_write_max to a value less than the cache device size but
+#              larger than the default (64MB).
+#      6. Record the l2_size.
+#      7. Random read for 1 sec.
+#      8. Record the l2_size again.
+#      9. If (6) <= (8) then we have not looped around yet.
+#      10. If (6) > (8) then we looped around. Break out of the loop and test.
+#      11. Destroy pool.
+#
+
+verify_runnable "global"
+
+log_assert "Looping around a cache device succeeds."
+
+function cleanup
+{
+       if poolexists $TESTPOOL ; then
+               destroy_pool $TESTPOOL
+       fi
+
+       log_must set_tunable32 L2ARC_WRITE_MAX $write_max
+       log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+}
+log_onexit cleanup
+
+typeset write_max=$(get_tunable L2ARC_WRITE_MAX)
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+
+typeset VDEV="$VDIR/vdev.disk"
+typeset VDEV_SZ=$(( 4 * 1024 * 1024 * 1024 ))
+typeset VCACHE="$VDIR/vdev.cache"
+typeset VCACHE_SZ=$(( $VDEV_SZ / 2 ))
+
+typeset fill_mb=$(( floor($VDEV_SZ * 3 / 4 ) ))
+export DIRECTORY=/$TESTPOOL
+export NUMJOBS=4
+export RUNTIME=10
+export PERF_RANDSEED=1234
+export PERF_COMPPERCENT=66
+export PERF_COMPCHUNK=0
+export BLOCKSIZE=128K
+export SYNC_TYPE=0
+export DIRECT=1
+export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))
+
+log_must set_tunable32 L2ARC_WRITE_MAX $(( $VCACHE_SZ * 2 ))
+
+log_must truncate -s $VCACHE_SZ $VCACHE
+log_must truncate -s $VDEV_SZ $VDEV
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VCACHE
+
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+log_must fio $FIO_SCRIPTS/random_reads.fio
+
+typeset write_max2=$(get_tunable L2ARC_WRITE_MAX)
+
+log_must test $write_max2 -eq $write_max
+
+log_must set_tunable32 L2ARC_WRITE_MAX $(( 64 * 1024 * 1024 ))
+export RUNTIME=1
+
+typeset do_once=true
+while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do
+       typeset l2_size1=$(get_arcstat l2_size)
+       log_must fio $FIO_SCRIPTS/random_reads.fio
+       typeset l2_size2=$(get_arcstat l2_size)
+       do_once=false
+done
+
+log_must test $l2_size1 -gt $l2_size2
+
+log_must zpool destroy $TESTPOOL
+
+log_pass "Looping around a cache device succeeds."