]> git.proxmox.com Git - mirror_zfs.git/commitdiff
ZIL: Detect single-threaded workloads
authorAlexander Motin <mav@FreeBSD.org>
Tue, 24 Oct 2023 21:35:25 +0000 (17:35 -0400)
committerGitHub <noreply@github.com>
Tue, 24 Oct 2023 21:35:25 +0000 (14:35 -0700)
... by checking that previous block is fully written and flushed.
It allows to skip commit delays since we can give up on aggregation
in that case.  This removes zil_min_commit_timeout parameter, since
for single-threaded workloads it is not needed at all, while on very
fast devices even some multi-threaded workloads may get detected as
single-threaded and still bypass the wait.  To give multi-threaded
workloads more aggregation chances increase zfs_commit_timeout_pct
from 5 to 10%, as they should suffer less from additional latency.

Also single-threaded workloads detection allows in perspective better
prediction of the next block size.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15381

include/sys/zil_impl.h
man/man4/zfs.4
module/zfs/zil.c

index f780ad3d61bc1012792633eddcb4639180994353..c9db6d428ea2526e7a409f6d3ecb44d223d09855 100644 (file)
@@ -181,6 +181,7 @@ typedef struct zil_vdev_node {
        avl_node_t      zv_node;        /* AVL tree linkage */
 } zil_vdev_node_t;
 
+#define        ZIL_BURSTS 8
 #define        ZIL_PREV_BLKS 16
 
 /*
@@ -222,8 +223,9 @@ struct zilog {
        clock_t         zl_replay_time; /* lbolt of when replay started */
        uint64_t        zl_replay_blks; /* number of log blocks replayed */
        zil_header_t    zl_old_header;  /* debugging aid */
-       uint_t          zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+       uint_t          zl_parallel;    /* workload is multi-threaded */
        uint_t          zl_prev_rotor;  /* rotor for zl_prev[] */
+       uint_t          zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
        txg_node_t      zl_dirty_link;  /* protected by dp_dirty_zilogs list */
        uint64_t        zl_dirty_max_txg; /* highest txg used to dirty zilog */
 
index 5f89f6adf1e33816a397193d8e3e37a7e3f873a8..7073cf4eecc87acf5ce0e7e58daec2749ba8f671 100644 (file)
@@ -798,7 +798,7 @@ Note that this should not be set below the ZED thresholds
 (currently 10 checksums over 10 seconds)
 or else the daemon may not trigger any action.
 .
-.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
+.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
 This controls the amount of time that a ZIL block (lwb) will remain "open"
 when it isn't "full", and it has a thread waiting for it to be committed to
 stable storage.
@@ -2155,13 +2155,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
 It tunes a tradeoff between additional memory copy and possibly worse log
 space efficiency vs additional range lock/unlock.
 .
-.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
-This sets the minimum delay in nanoseconds ZIL care to delay block commit,
-waiting for more records.
-If ZIL writes are too fast, kernel may not be able sleep for so short interval,
-increasing log latency above allowed by
-.Sy zfs_commit_timeout_pct .
-.
 .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable the cache flush commands that are normally sent to disk by
 the ZIL after an LWB write has completed.
index a11886136994b797cc3be737893702b3013dd572..218031a8af4bc49fa944517cccb5487fc4ddd50c 100644 (file)
  * committed to stable storage. Please refer to the zil_commit_waiter()
  * function (and the comments within it) for more details.
  */
-static uint_t zfs_commit_timeout_pct = 5;
-
-/*
- * Minimal time we care to delay commit waiting for more ZIL records.
- * At least FreeBSD kernel can't sleep for less than 2us at its best.
- * So requests to sleep for less then 5us is a waste of CPU time with
- * a risk of significant log latency increase due to oversleep.
- */
-static uint64_t zil_min_commit_timeout = 5000;
+static uint_t zfs_commit_timeout_pct = 10;
 
 /*
  * See zil.h for more information about these fields.
@@ -2696,6 +2688,19 @@ zil_commit_writer_stall(zilog_t *zilog)
        ASSERT(list_is_empty(&zilog->zl_lwb_list));
 }
 
+static void
+zil_burst_done(zilog_t *zilog)
+{
+       if (!list_is_empty(&zilog->zl_itx_commit_list) ||
+           zilog->zl_cur_used == 0)
+               return;
+
+       if (zilog->zl_parallel)
+               zilog->zl_parallel--;
+
+       zilog->zl_cur_used = 0;
+}
+
 /*
  * This function will traverse the commit list, creating new lwbs as
  * needed, and committing the itxs from the commit list to these newly
@@ -2710,7 +2715,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
        list_t nolwb_waiters;
        lwb_t *lwb, *plwb;
        itx_t *itx;
-       boolean_t first = B_TRUE;
 
        ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
@@ -2736,9 +2740,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
                zil_commit_activate_saxattr_feature(zilog);
                ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
                    lwb->lwb_state == LWB_STATE_OPENED);
-               first = (lwb->lwb_state == LWB_STATE_NEW) &&
-                   ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL ||
-                   plwb->lwb_state == LWB_STATE_FLUSH_DONE);
+
+               /*
+                * If the lwb is still opened, it means the workload is really
+                * multi-threaded and we won the chance of write aggregation.
+                * If it is not opened yet, but previous lwb is still not
+                * flushed, it still means the workload is multi-threaded, but
+                * there was too much time between the commits to aggregate, so
+                * we try aggregation next times, but without too much hopes.
+                */
+               if (lwb->lwb_state == LWB_STATE_OPENED) {
+                       zilog->zl_parallel = ZIL_BURSTS;
+               } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
+                   != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
+                       zilog->zl_parallel = MAX(zilog->zl_parallel,
+                           ZIL_BURSTS / 2);
+               }
        }
 
        while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
@@ -2813,7 +2830,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
                                         * Our lwb is done, leave the rest of
                                         * itx list to somebody else who care.
                                         */
-                                       first = B_FALSE;
+                                       zilog->zl_parallel = ZIL_BURSTS;
                                        break;
                                }
                        } else {
@@ -2905,28 +2922,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
                 * try and pack as many itxs into as few lwbs as
                 * possible, without significantly impacting the latency
                 * of each individual itx.
-                *
-                * If we had no already running or open LWBs, it can be
-                * the workload is single-threaded.  And if the ZIL write
-                * latency is very small or if the LWB is almost full, it
-                * may be cheaper to bypass the delay.
                 */
-               if (lwb->lwb_state == LWB_STATE_OPENED && first) {
-                       hrtime_t sleep = zilog->zl_last_lwb_latency *
-                           zfs_commit_timeout_pct / 100;
-                       if (sleep < zil_min_commit_timeout ||
-                           lwb->lwb_nmax - lwb->lwb_nused <
-                           lwb->lwb_nmax / 8) {
-                               list_insert_tail(ilwbs, lwb);
-                               lwb = zil_lwb_write_close(zilog, lwb,
-                                   LWB_STATE_NEW);
-                               zilog->zl_cur_used = 0;
-                               if (lwb == NULL) {
-                                       while ((lwb = list_remove_head(ilwbs))
-                                           != NULL)
-                                               zil_lwb_write_issue(zilog, lwb);
-                                       zil_commit_writer_stall(zilog);
-                               }
+               if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
+                       list_insert_tail(ilwbs, lwb);
+                       lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+                       zil_burst_done(zilog);
+                       if (lwb == NULL) {
+                               while ((lwb = list_remove_head(ilwbs)) != NULL)
+                                       zil_lwb_write_issue(zilog, lwb);
+                               zil_commit_writer_stall(zilog);
                        }
                }
        }
@@ -3084,19 +3088,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 
        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
-       /*
-        * Since the lwb's zio hadn't been issued by the time this thread
-        * reached its timeout, we reset the zilog's "zl_cur_used" field
-        * to influence the zil block size selection algorithm.
-        *
-        * By having to issue the lwb's zio here, it means the size of the
-        * lwb was too large, given the incoming throughput of itxs.  By
-        * setting "zl_cur_used" to zero, we communicate this fact to the
-        * block size selection algorithm, so it can take this information
-        * into account, and potentially select a smaller size for the
-        * next lwb block that is allocated.
-        */
-       zilog->zl_cur_used = 0;
+       zil_burst_done(zilog);
 
        if (nlwb == NULL) {
                /*
@@ -4214,9 +4206,6 @@ EXPORT_SYMBOL(zil_kstat_values_update);
 ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
        "ZIL block open timeout percentage");
 
-ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW,
-       "Minimum delay we care for ZIL block commit");
-
 ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
        "Disable intent logging replay");