git.proxmox.com Git - mirror_zfs.git/commitdiff
OpenZFS 6569 - large file delete can starve out write ops
authorGeorge Melikov <mail@gmelikov.ru>
Tue, 31 Jan 2017 22:44:03 +0000 (01:44 +0300)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 31 Jan 2017 22:44:03 +0000 (14:44 -0800)
Authored by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: George Melikov <mail@gmelikov.ru>
Tested-by: kernelOfTruth <kerneloftruth@gmail.com>
OpenZFS-issue: https://www.illumos.org/issues/6569
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/1bf4b6f2
Closes #5706

include/sys/dsl_pool.h
include/sys/trace_dmu.h
module/zfs/dmu.c
module/zfs/dsl_pool.c

index 48b12e8eb1346236288f71cf198f6c45023456c7..b509d312b628e5b48a1529c6ed2275eb0b818198 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef        _SYS_DSL_POOL_H
@@ -106,6 +107,7 @@ typedef struct dsl_pool {
        kcondvar_t dp_spaceavail_cv;
        uint64_t dp_dirty_pertxg[TXG_SIZE];
        uint64_t dp_dirty_total;
+       uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
        uint64_t dp_mos_used_delta;
        uint64_t dp_mos_compressed_delta;
        uint64_t dp_mos_uncompressed_delta;
index 916c9bdbae657583c340a4e3487afb42610d07fd..b2f37a6be4c1615a43a8a1dfa0647a16ede56f13 100644 (file)
@@ -112,6 +112,36 @@ DEFINE_EVENT(zfs_delay_mintime_class, name, \
 /* END CSTYLED */
 DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime);
 
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_free_long_range_class,
+       TP_PROTO(uint64_t long_free_dirty_all_txgs, uint64_t chunk_len, \
+           uint64_t txg),
+       TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg),
+       TP_STRUCT__entry(
+           __field(uint64_t,                   long_free_dirty_all_txgs)
+           __field(uint64_t,                   chunk_len)
+           __field(uint64_t,                   txg)
+       ),
+       TP_fast_assign(
+           __entry->long_free_dirty_all_txgs   = long_free_dirty_all_txgs;
+           __entry->chunk_len                                  = chunk_len;
+           __entry->txg                                                = txg;
+       ),
+       TP_printk("long_free_dirty_all_txgs %llu chunk_len %llu txg %llu",
+          __entry->long_free_dirty_all_txgs,
+          __entry->chunk_len, __entry->txg)
+);
+/* END CSTYLED */
+
+/* BEGIN CSTYLED */
+#define        DEFINE_FREE_LONG_RANGE_EVENT(name) \
+DEFINE_EVENT(zfs_free_long_range_class, name, \
+       TP_PROTO(uint64_t long_free_dirty_all_txgs, \
+           uint64_t chunk_len, uint64_t txg), \
+       TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg))
+/* END CSTYLED */
+DEFINE_FREE_LONG_RANGE_EVENT(zfs_free__long__range);
+
 #endif /* _TRACE_DMU_H */
 
 #undef TRACE_INCLUDE_PATH
index cdbcfe2505c3c59caeb7455c806a0a2053e78b42..b0bceac2548c4a49a80d2b5a67b171f14a06565b 100644 (file)
@@ -48,6 +48,7 @@
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
+#include <sys/trace_dmu.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
  */
 int zfs_nopwrite_enabled = 1;
 
+/*
+ * Tunable to control percentage of dirtied blocks from frees in one TXG.
+ * After this threshold is crossed, additional dirty blocks from frees
+ * wait until the next TXG.
+ * A value of zero will disable this throttle.
+ */
+uint32_t zfs_per_txg_dirty_frees_percent = 30;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
        {       DMU_BSWAP_UINT8,        TRUE,   "unallocated"           },
        {       DMU_BSWAP_ZAP,          TRUE,   "object directory"      },
@@ -727,6 +736,9 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 {
        uint64_t object_size;
        int err;
+       uint64_t dirty_frees_threshold;
+       dsl_pool_t *dp = dmu_objset_pool(os);
+       int t;
 
        if (dn == NULL)
                return (SET_ERROR(EINVAL));
@@ -735,11 +747,18 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
        if (offset >= object_size)
                return (0);
 
+       if (zfs_per_txg_dirty_frees_percent <= 100)
+               dirty_frees_threshold =
+                   zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
+       else
+               dirty_frees_threshold = zfs_dirty_data_max / 4;
+
        if (length == DMU_OBJECT_END || offset + length > object_size)
                length = object_size - offset;
 
        while (length != 0) {
-               uint64_t chunk_end, chunk_begin;
+               uint64_t chunk_end, chunk_begin, chunk_len;
+               uint64_t long_free_dirty_all_txgs = 0;
                dmu_tx_t *tx;
 
                if (dmu_objset_zfs_unmounting(dn->dn_objset))
@@ -754,9 +773,28 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
                ASSERT3U(chunk_begin, >=, offset);
                ASSERT3U(chunk_begin, <=, chunk_end);
 
+               chunk_len = chunk_end - chunk_begin;
+
+               mutex_enter(&dp->dp_lock);
+               for (t = 0; t < TXG_SIZE; t++) {
+                       long_free_dirty_all_txgs +=
+                           dp->dp_long_free_dirty_pertxg[t];
+               }
+               mutex_exit(&dp->dp_lock);
+
+               /*
+                * To avoid filling up a TXG with just frees wait for
+                * the next TXG to open before freeing more chunks if
+                * we have reached the threshold of frees
+                */
+               if (dirty_frees_threshold != 0 &&
+                   long_free_dirty_all_txgs >= dirty_frees_threshold) {
+                       txg_wait_open(dp, 0);
+                       continue;
+               }
+
                tx = dmu_tx_create(os);
-               dmu_tx_hold_free(tx, dn->dn_object,
-                   chunk_begin, chunk_end - chunk_begin);
+               dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
                /*
                 * Mark this transaction as typically resulting in a net
@@ -768,10 +806,18 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
                        dmu_tx_abort(tx);
                        return (err);
                }
-               dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
+
+               mutex_enter(&dp->dp_lock);
+               dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
+                   chunk_len;
+               mutex_exit(&dp->dp_lock);
+               DTRACE_PROBE3(free__long__range,
+                   uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
+                   uint64_t, dmu_tx_get_txg(tx));
+               dnode_free_range(dn, chunk_begin, chunk_len, tx);
                dmu_tx_commit(tx);
 
-               length -= chunk_end - chunk_begin;
+               length -= chunk_len;
        }
        return (0);
 }
index 2ff3ae45681ff1cee85fd17d7b64adc10b5aedb0..1b8b780aa40175231fc9c6ce87861f818950cc3c 100644 (file)
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
@@ -509,6 +510,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
         */
        dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 
+       /*
+        * Update the long range free counter after
+        * we're done syncing user data
+        */
+       mutex_enter(&dp->dp_lock);
+       ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
+           dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
+       dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
+       mutex_exit(&dp->dp_lock);
+
        /*
         * After the data blocks have been written (ensured by the zio_wait()
         * above), update the user/group space accounting.