/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright 2011 Martin Matuska
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
+#include <sys/zil.h>
#include <sys/callb.h>
#include <sys/trace_txg.h>
* now transition to the syncing state.
*/
-static void txg_sync_thread(dsl_pool_t *dp);
-static void txg_quiesce_thread(dsl_pool_t *dp);
+static void txg_sync_thread(void *arg);
+static void txg_quiesce_thread(void *arg);
int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
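/*
 * Context (hedged): the sync loop below arms its wait from this
 * tunable, so an otherwise idle pool still rotates txgs roughly every
 * zfs_txg_timeout seconds; in clock ticks that is
 *
 *	timeout = zfs_txg_timeout * hz;
 */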
int i;
mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
+ mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_NOLOCKDEP,
NULL);
for (i = 0; i < TXG_SIZE; i++) {
cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
tx_state_t *tx = &dp->dp_tx;
int c;
- ASSERT(tx->tx_threads == 0);
+ ASSERT0(tx->tx_threads);
mutex_destroy(&tx->tx_sync_lock);
dprintf("pool %p\n", dp);
- ASSERT(tx->tx_threads == 0);
+ ASSERT0(tx->tx_threads);
tx->tx_threads = 2;
tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
- dp, 0, &p0, TS_RUN, minclsyspri);
+ dp, 0, &p0, TS_RUN, defclsyspri);
/*
* The sync thread can need a larger-than-default stack size on
* 32-bit x86. This is due in part to nested pools and
* scrub_visitbp() recursion.
*/
- tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
- dp, 0, &p0, TS_RUN, minclsyspri);
+ tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+ dp, 0, &p0, TS_RUN, defclsyspri);
mutex_exit(&tx->tx_sync_lock);
}
{
CALLB_CPR_SAFE_BEGIN(cpr);
- if (time)
- (void) cv_timedwait_interruptible(cv, &tx->tx_sync_lock,
+ /*
+ * cv_wait_sig() is used instead of cv_wait() in order to prevent
+ * this process from incorrectly contributing to the system load
+ * average when idle.
+ */
+ if (time) {
+ (void) cv_timedwait_sig(cv, &tx->tx_sync_lock,
ddi_get_lbolt() + time);
- else
- cv_wait_interruptible(cv, &tx->tx_sync_lock);
+ } else {
+ cv_wait_sig(cv, &tx->tx_sync_lock);
+ }
CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
}
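/*
 * Context (hedged): the CALLB_CPR_SAFE_BEGIN/END pair brackets the
 * sleep so this thread is considered safe for CPR (suspend/resume)
 * while it waits; the condvar reacquires tx_sync_lock before
 * returning, which CALLB_CPR_SAFE_END expects.  cv_wait_sig() can
 * also return early on a signal, which is harmless here because every
 * caller in this file re-checks its condition in a loop.
 */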
/*
* Finish off any work in progress.
*/
- ASSERT(tx->tx_threads == 2);
+ ASSERT3U(tx->tx_threads, ==, 2);
/*
* We need to ensure that we've vacated the deferred space_maps.
*/
mutex_enter(&tx->tx_sync_lock);
- ASSERT(tx->tx_threads == 2);
+ ASSERT3U(tx->tx_threads, ==, 2);
tx->tx_exiting = 1;
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
tx_state_t *tx = &dp->dp_tx;
+ uint64_t tx_open_time;
int g = txg & TXG_MASK;
int c;
ASSERT(txg == tx->tx_open_txg);
tx->tx_open_txg++;
- tx->tx_open_time = gethrtime();
-
- spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx->tx_open_time);
- spa_txg_history_add(dp->dp_spa, tx->tx_open_txg, tx->tx_open_time);
+ tx->tx_open_time = tx_open_time = gethrtime();
DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
for (c = 0; c < max_ncpus; c++)
mutex_exit(&tx->tx_cpu[c].tc_open_lock);
+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
+ spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);
+
/*
* Quiesce the transaction group by waiting for everyone to txg_exit().
*/
* Commit callback taskq hasn't been created yet.
*/
tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
- 100, minclsyspri, max_ncpus, INT_MAX,
- TASKQ_THREADS_CPU_PCT | TASKQ_PREPOPULATE);
+ max_ncpus, defclsyspri, max_ncpus, max_ncpus * 2,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
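/*
 * Reading of the new arguments, assuming the usual SPL taskq_create()
 * signature (name, nthreads, pri, minalloc, maxalloc, flags):
 * max_ncpus workers at defclsyspri, max_ncpus task entries
 * prepopulated, a cap of max_ncpus * 2 entries, and TASKQ_DYNAMIC so
 * worker threads are created on demand instead of all up front.
 */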
}
cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
}
+static boolean_t
+txg_is_syncing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_syncing_txg != 0);
+}
+
+static boolean_t
+txg_is_quiescing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiescing_txg != 0);
+}
+
+static boolean_t
+txg_has_quiesced_to_sync(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiesced_txg != 0);
+}
+
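/*
 * Summary (derived from the fields read above, all examined under
 * tx_sync_lock): a txg advances through the stages
 *
 *	open (tx_open_txg) -> quiescing (tx_quiescing_txg != 0) ->
 *	quiesced, awaiting sync (tx_quiesced_txg != 0) ->
 *	syncing (tx_syncing_txg != 0) -> synced (tx_synced_txg)
 *
 * with at most one txg in each of the middle stages at a time.
 */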
static void
-txg_sync_thread(dsl_pool_t *dp)
+txg_sync_thread(void *arg)
{
+ dsl_pool_t *dp = arg;
spa_t *spa = dp->dp_spa;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
- vdev_stat_t *vs1, *vs2;
clock_t start, delta;
(void) spl_fstrans_mark();
txg_thread_enter(tx, &cpr);
- vs1 = kmem_alloc(sizeof (vdev_stat_t), KM_SLEEP);
- vs2 = kmem_alloc(sizeof (vdev_stat_t), KM_SLEEP);
-
start = delta = 0;
for (;;) {
- clock_t timer, timeout;
+ clock_t timeout = zfs_txg_timeout * hz;
+ clock_t timer;
uint64_t txg;
- uint64_t ndirty;
-
- timeout = zfs_txg_timeout * hz;
+ uint64_t dirty_min_bytes =
+ zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
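/*
 * Worked example (hypothetical values): with zfs_dirty_data_max set
 * to 4 GiB and zfs_dirty_data_sync_percent set to 20, a sync is
 * prompted once dirty data crosses 4 GiB * 20 / 100, about 819 MiB.
 */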
/*
 * We sync when we're scanning, there's someone waiting
 * on us, or the quiesce thread has handed off a txg to
 * us, or we have reached our timeout.
 */
while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
- tx->tx_quiesced_txg == 0 &&
- dp->dp_dirty_total < zfs_dirty_data_sync) {
+ !txg_has_quiesced_to_sync(dp) &&
+ dp->dp_dirty_total < dirty_min_bytes) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
*/
- while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
cv_broadcast(&tx->tx_quiesce_more_cv);
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
}
- if (tx->tx_exiting) {
- kmem_free(vs2, sizeof (vdev_stat_t));
- kmem_free(vs1, sizeof (vdev_stat_t));
+ if (tx->tx_exiting)
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
- }
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
- vdev_get_stats(spa->spa_root_vdev, vs1);
- spa_config_exit(spa, SCL_ALL, FTAG);
/*
* Consume the quiesced txg which has been handed off to
* us. This may cause the quiescing thread to now be
* able to quiesce another txg, so we must signal it.
*/
+ ASSERT(tx->tx_quiesced_txg != 0);
txg = tx->tx_quiesced_txg;
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
- spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC,
- gethrtime());
- ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
-
+ txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
start = ddi_get_lbolt();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
+ spa_txg_history_fini_io(spa, ts);
mutex_enter(&tx->tx_sync_lock);
tx->tx_synced_txg = txg;
* Dispatch commit callbacks to worker threads.
*/
txg_dispatch_callbacks(dp, txg);
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
- vdev_get_stats(spa->spa_root_vdev, vs2);
- spa_config_exit(spa, SCL_ALL, FTAG);
- spa_txg_history_set_io(spa, txg,
- vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
- vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
- vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
- vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
- ndirty);
- spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
}
}
static void
-txg_quiesce_thread(dsl_pool_t *dp)
+txg_quiesce_thread(void *arg)
{
+ dsl_pool_t *dp = arg;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
*/
while (!tx->tx_exiting &&
(tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
- tx->tx_quiesced_txg != 0))
+ txg_has_quiesced_to_sync(dp)))
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
if (tx->tx_exiting)
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting,
tx->tx_sync_txg_waiting);
+ tx->tx_quiescing_txg = txg;
+
mutex_exit(&tx->tx_sync_lock);
txg_quiesce(dp, txg);
mutex_enter(&tx->tx_sync_lock);
* Hand this txg off to the sync thread.
*/
dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiescing_txg = 0;
tx->tx_quiesced_txg = txg;
DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_more_cv);
ASSERT(!dsl_pool_config_held(dp));
mutex_enter(&tx->tx_sync_lock);
- ASSERT(tx->tx_threads == 2);
+ ASSERT3U(tx->tx_threads, ==, 2);
if (txg == 0)
txg = tx->tx_open_txg + TXG_DEFER_SIZE;
if (tx->tx_sync_txg_waiting < txg)
"tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
cv_broadcast(&tx->tx_sync_more_cv);
- cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+ cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
}
mutex_exit(&tx->tx_sync_lock);
}
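/*
 * Illustrative usage (hedged): the common call elsewhere in ZFS passes
 * 0 to wait for everything dirty at the time of the call, picking up
 * the TXG_DEFER_SIZE window applied above.
 *
 *	txg_wait_synced(dp, 0);
 */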
+/*
+ * Wait for the specified open transaction group. Set should_quiesce
+ * when the current open txg should be quiesced immediately.
+ */
void
-txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
{
tx_state_t *tx = &dp->dp_tx;
ASSERT(!dsl_pool_config_held(dp));
mutex_enter(&tx->tx_sync_lock);
- ASSERT(tx->tx_threads == 2);
+ ASSERT3U(tx->tx_threads, ==, 2);
if (txg == 0)
txg = tx->tx_open_txg + 1;
- if (tx->tx_quiesce_txg_waiting < txg)
+ if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
tx->tx_quiesce_txg_waiting = txg;
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
while (tx->tx_open_txg < txg) {
cv_broadcast(&tx->tx_quiesce_more_cv);
- cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ /*
+ * Callers setting should_quiesce will use cv_wait_io() and
+ * be accounted for as iowait time. Otherwise, the caller is
+ * understood to be idle and cv_wait_sig() is used to prevent
+ * incorrectly inflating the system load average.
+ */
+ if (should_quiesce == B_TRUE) {
+ cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ } else {
+ cv_wait_sig(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ }
}
mutex_exit(&tx->tx_sync_lock);
}
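/*
 * Illustrative usage (hedged): the flag selects both behavior and
 * accounting.
 *
 *	txg_wait_open(dp, 0, B_TRUE);	forces a quiesce; counted as iowait
 *	txg_wait_open(dp, 0, B_FALSE);	passive wait; kept out of load average
 */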
ASSERT(!dsl_pool_config_held(dp));
mutex_enter(&tx->tx_sync_lock);
- if (tx->tx_syncing_txg == 0 &&
+ if (!txg_is_syncing(dp) &&
+ !txg_is_quiescing(dp) &&
tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
tx->tx_quiesced_txg <= tx->tx_synced_txg) {
tx->tx_quiesced_txg != 0);
}
+/*
+ * Verify that this txg is active (open, quiescing, syncing). Non-active
+ * txgs should not be manipulated.
+ */
+#ifdef ZFS_DEBUG
+void
+txg_verify(spa_t *spa, uint64_t txg)
+{
+ ASSERTV(dsl_pool_t *dp = spa_get_dsl(spa));
+ if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
+ return;
+ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+ ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
+ ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
+}
+#endif
+
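/*
 * The TXG_VERIFY() calls below presumably expand to txg_verify() only
 * in debug builds, via a header macro along these lines (assumption;
 * the header is not shown in this hunk):
 *
 *	#ifdef ZFS_DEBUG
 *	#define	TXG_VERIFY(spa, txg)	txg_verify(spa, txg)
 *	#else
 *	#define	TXG_VERIFY(spa, txg)
 *	#endif
 */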
/*
* Per-txg object lists.
*/
void
-txg_list_create(txg_list_t *tl, size_t offset)
+txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
{
int t;
mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
tl->tl_offset = offset;
+ tl->tl_spa = spa;
for (t = 0; t < TXG_SIZE; t++)
tl->tl_head[t] = NULL;
}
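/*
 * Illustrative sketch (my_obj_t is hypothetical): callers embed a
 * txg_node_t in their object and register its offset; the per-txg
 * links inside txg_node_t let one object sit on the same list under
 * several txgs at once.
 *
 *	typedef struct my_obj {
 *		txg_node_t	mo_node;
 *		uint64_t	mo_value;
 *	} my_obj_t;
 *
 *	txg_list_create(&tl, spa, offsetof(my_obj_t, mo_node));
 */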
+static boolean_t
+txg_list_empty_impl(txg_list_t *tl, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&tl->tl_lock));
+ TXG_VERIFY(tl->tl_spa, txg);
+ return (tl->tl_head[txg & TXG_MASK] == NULL);
+}
+
+boolean_t
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+ mutex_enter(&tl->tl_lock);
+ boolean_t ret = txg_list_empty_impl(tl, txg);
+ mutex_exit(&tl->tl_lock);
+
+ return (ret);
+}
+
void
txg_list_destroy(txg_list_t *tl)
{
int t;
+ mutex_enter(&tl->tl_lock);
for (t = 0; t < TXG_SIZE; t++)
- ASSERT(txg_list_empty(tl, t));
+ ASSERT(txg_list_empty_impl(tl, t));
+ mutex_exit(&tl->tl_lock);
mutex_destroy(&tl->tl_lock);
}
-boolean_t
-txg_list_empty(txg_list_t *tl, uint64_t txg)
-{
- return (tl->tl_head[txg & TXG_MASK] == NULL);
-}
-
/*
* Returns true if all txg lists are empty.
*
* Warning: this is inherently racy (an item could be added immediately
- * after this function returns). We don't bother with the lock because
- * it wouldn't change the semantics.
+ * after this function returns).
*/
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- if (!txg_list_empty(tl, i)) {
+ mutex_enter(&tl->tl_lock);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ if (!txg_list_empty_impl(tl, i)) {
+ mutex_exit(&tl->tl_lock);
return (B_FALSE);
}
}
+ mutex_exit(&tl->tl_lock);
return (B_TRUE);
}
txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
boolean_t add;
+ TXG_VERIFY(tl->tl_spa, txg);
mutex_enter(&tl->tl_lock);
add = (tn->tn_member[t] == 0);
if (add) {
txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
boolean_t add;
+ TXG_VERIFY(tl->tl_spa, txg);
mutex_enter(&tl->tl_lock);
add = (tn->tn_member[t] == 0);
if (add) {
txg_node_t *tn;
void *p = NULL;
+ TXG_VERIFY(tl->tl_spa, txg);
mutex_enter(&tl->tl_lock);
if ((tn = tl->tl_head[t]) != NULL) {
+ ASSERT(tn->tn_member[t]);
+ ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
p = (char *)tn - tl->tl_offset;
tl->tl_head[t] = tn->tn_next[t];
tn->tn_next[t] = NULL;
int t = txg & TXG_MASK;
txg_node_t *tn, **tp;
+ TXG_VERIFY(tl->tl_spa, txg);
mutex_enter(&tl->tl_lock);
for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
int t = txg & TXG_MASK;
txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ TXG_VERIFY(tl->tl_spa, txg);
return (tn->tn_member[t] != 0);
}
/*
- * Walk a txg list -- only safe if you know it's not changing.
+ * Walk a txg list.
*/
void *
txg_list_head(txg_list_t *tl, uint64_t txg)
{
int t = txg & TXG_MASK;
- txg_node_t *tn = tl->tl_head[t];
+ txg_node_t *tn;
+
+ mutex_enter(&tl->tl_lock);
+ tn = tl->tl_head[t];
+ mutex_exit(&tl->tl_lock);
+ TXG_VERIFY(tl->tl_spa, txg);
return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}
int t = txg & TXG_MASK;
txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ TXG_VERIFY(tl->tl_spa, txg);
+
+ mutex_enter(&tl->tl_lock);
tn = tn->tn_next[t];
+ mutex_exit(&tl->tl_lock);
return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
}
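/*
 * Illustrative walk (process() is hypothetical): each step takes
 * tl_lock on its own, so the traversal is point-in-time consistent
 * per step but not atomic as a whole.
 *
 *	for (void *p = txg_list_head(tl, txg); p != NULL;
 *	    p = txg_list_next(tl, p, txg))
 *		process(p);
 */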
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
EXPORT_SYMBOL(txg_init);
EXPORT_SYMBOL(txg_fini);
EXPORT_SYMBOL(txg_sync_start);