static kmem_cache_t *zil_zcw_cache;
static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
-static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb);
static itx_t *zil_itx_clone(itx_t *oitx);
static int
return (TREE_CMP(v1, v2));
}
+/*
+ * Allocate a new lwb. We may already have a block pointer for it, in which
+ * case we get size and version from there. Or we may not yet, in which case
+ * we choose them here and later make the block allocation match.
+ */
static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
+ uint64_t txg, lwb_state_t state)
{
lwb_t *lwb;
lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
lwb->lwb_zilog = zilog;
- lwb->lwb_blk = *bp;
+ if (bp) {
+ lwb->lwb_blk = *bp;
+ lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
+ sz = BP_GET_LSIZE(bp);
+ } else {
+ BP_ZERO(&lwb->lwb_blk);
+ lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
+ SPA_VERSION_SLIM_ZIL);
+ }
lwb->lwb_slog = slog;
- lwb->lwb_indirect = B_FALSE;
- if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ lwb->lwb_error = 0;
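+	/*
+	 * In a "slim" ZIL block the zil_chain_t header sits at the start of
+	 * the buffer, so everything after it is usable; in the old format
+	 * the header occupies the end of the buffer instead.
+	 */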
+ if (lwb->lwb_slim) {
+ lwb->lwb_nmax = sz;
lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
- lwb->lwb_sz = BP_GET_LSIZE(bp);
} else {
+ lwb->lwb_nmax = sz - sizeof (zil_chain_t);
lwb->lwb_nused = lwb->lwb_nfilled = 0;
- lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
}
- lwb->lwb_state = LWB_STATE_CLOSED;
- lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+ lwb->lwb_sz = sz;
+ lwb->lwb_state = state;
+ lwb->lwb_buf = zio_buf_alloc(sz);
+ lwb->lwb_child_zio = NULL;
lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL;
lwb->lwb_issued_timestamp = 0;
lwb->lwb_issued_txg = 0;
- lwb->lwb_max_txg = txg;
+ lwb->lwb_alloc_txg = txg;
+ lwb->lwb_max_txg = 0;
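+	/* lwb_max_txg is raised by zil_lwb_add_txg() as itxs are assigned. */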
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
+ if (state != LWB_STATE_NEW)
+ zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
return (lwb);
VERIFY(list_is_empty(&lwb->lwb_waiters));
VERIFY(list_is_empty(&lwb->lwb_itxs));
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ ASSERT3P(lwb->lwb_child_zio, ==, NULL);
ASSERT3P(lwb->lwb_write_zio, ==, NULL);
ASSERT3P(lwb->lwb_root_zio, ==, NULL);
+ ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
- ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
+ ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
lwb->lwb_state == LWB_STATE_FLUSH_DONE);
/*
* Allocate a log write block (lwb) for the first log block.
*/
if (error == 0)
- lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
+ lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
/*
* If we just allocated the first log block, commit our transaction
while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
+ if (!BP_IS_HOLE(&lwb->lwb_blk))
+ zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
zil_free_lwb(zilog, lwb);
}
} else if (!keep_first) {
{
/*
* The lwb_waiters field of the lwb is protected by the zilog's
- * zl_lock, thus it must be held when calling this function.
+ * zl_issuer_lock while the lwb is open, and by zl_lock otherwise.
+ * zl_issuer_lock also protects leaving the open state.
+ * Setting zcw_lwb is protected by zl_issuer_lock together with the
+ * lwb state not yet being flush_done; that state transition is
+ * itself protected by zl_lock.
*/
- ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
+ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock));
+ IMPLY(lwb->lwb_state != LWB_STATE_OPENED,
+ MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
- mutex_enter(&zcw->zcw_lock);
ASSERT(!list_link_active(&zcw->zcw_node));
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
- ASSERT3P(lwb, !=, NULL);
- ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
- lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE);
-
list_insert_tail(&lwb->lwb_waiters, zcw);
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
zcw->zcw_lwb = lwb;
- mutex_exit(&zcw->zcw_lock);
}
/*
static void
zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
{
- mutex_enter(&zcw->zcw_lock);
ASSERT(!list_link_active(&zcw->zcw_node));
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
list_insert_tail(nolwb, zcw);
- mutex_exit(&zcw->zcw_lock);
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
}
void
mutex_enter(&zilog->zl_lock);
mutex_enter(&zilog->zl_lwb_io_lock);
lwb_t *lwb = list_head(&zilog->zl_lwb_list);
- while (lwb != NULL && lwb->lwb_max_txg <= txg) {
+ while (lwb != NULL) {
if (lwb->lwb_issued_txg <= txg) {
ASSERT(lwb->lwb_state != LWB_STATE_ISSUED);
ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE);
ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
- ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
- ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
- ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
- ASSERT(!BP_IS_GANG(zio->io_bp));
- ASSERT(!BP_IS_HOLE(zio->io_bp));
- ASSERT(BP_GET_FILL(zio->io_bp) == 0);
-
abd_free(zio->io_abd);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
lwb->lwb_buf = NULL;
mutex_enter(&zilog->zl_lock);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
lwb->lwb_state = LWB_STATE_WRITE_DONE;
+ lwb->lwb_child_zio = NULL;
lwb->lwb_write_zio = NULL;
nlwb = list_next(&zilog->zl_lwb_list, lwb);
mutex_exit(&zilog->zl_lock);
}
}
+/*
+ * Build the zio dependency chain, which is used to preserve the ordering of
+ * lwb completions that is required by the semantics of the ZIL. Each new lwb
+ * zio becomes a parent of the previous lwb zio, such that the new lwb's zio
+ * cannot complete until the previous lwb's zio completes.
+ *
+ * This is required by the semantics of zil_commit(): the commit waiters
+ * attached to the lwbs will be woken in the lwb zio's completion callback,
+ * so this zio dependency graph ensures the waiters are woken in the correct
+ * order (the same order the lwbs were created).
+ */
static void
zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
{
- lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT(MUTEX_HELD(&zilog->zl_lock));
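+	/*
+	 * If there is no previous lwb, or it has already fully completed,
+	 * there is nothing to order against.
+	 */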
+ lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb);
+ if (prev_lwb == NULL ||
+ prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ return;
+
/*
- * The zilog's "zl_last_lwb_opened" field is used to build the
- * lwb/zio dependency chain, which is used to preserve the
- * ordering of lwb completions that is required by the semantics
- * of the ZIL. Each new lwb zio becomes a parent of the
- * "previous" lwb zio, such that the new lwb's zio cannot
- * complete until the "previous" lwb's zio completes.
+ * If the previous lwb's write hasn't already completed, we also want
+	 * to order the completion of the lwb write zios (below, we
+	 * unconditionally order the completion of the lwb root zios). This
+	 * is required because of
+ * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+ *
+ * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous
+ * lwb will rely on this lwb to flush the vdevs written to by that
+ * previous lwb. Thus, we need to ensure this lwb doesn't issue the
+ * flush until after the previous lwb's write completes. We ensure
+ * this ordering by setting the zio parent/child relationship here.
*
- * This is required by the semantics of zil_commit(); the commit
- * waiters attached to the lwbs will be woken in the lwb zio's
- * completion callback, so this zio dependency graph ensures the
- * waiters are woken in the correct order (the same order the
- * lwbs were created).
+ * Without this relationship on the lwb's write zio, it's possible
+ * for this lwb's write to complete prior to the previous lwb's write
+ * completing; and thus, the vdevs for the previous lwb would be
+ * flushed prior to that lwb's data being written to those vdevs (the
+ * vdevs are flushed in the lwb write zio's completion handler,
+ * zil_lwb_write_done()).
*/
- if (last_lwb_opened != NULL &&
- last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
- ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
- last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
- last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
-
- ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
- zio_add_child(lwb->lwb_root_zio,
- last_lwb_opened->lwb_root_zio);
-
- /*
- * If the previous lwb's write hasn't already completed,
- * we also want to order the completion of the lwb write
- * zios (above, we only order the completion of the lwb
- * root zios). This is required because of how we can
- * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
- *
- * When the DKIOCFLUSHWRITECACHE commands are deferred,
- * the previous lwb will rely on this lwb to flush the
- * vdevs written to by that previous lwb. Thus, we need
- * to ensure this lwb doesn't issue the flush until
- * after the previous lwb's write completes. We ensure
- * this ordering by setting the zio parent/child
- * relationship here.
- *
- * Without this relationship on the lwb's write zio,
- * it's possible for this lwb's write to complete prior
- * to the previous lwb's write completing; and thus, the
- * vdevs for the previous lwb would be flushed prior to
- * that lwb's data being written to those vdevs (the
- * vdevs are flushed in the lwb write zio's completion
- * handler, zil_lwb_write_done()).
- */
- if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
- ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
- last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
-
- ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
- zio_add_child(lwb->lwb_write_zio,
- last_lwb_opened->lwb_write_zio);
- }
+ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) {
+ ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL);
+ zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio);
+ } else {
+ ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
}
+
+ ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL);
+ zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio);
}
/*
* This function's purpose is to "open" an lwb such that it is ready to
- * accept new itxs being committed to it. To do this, the lwb's zio
- * structures are created, and linked to the lwb. This function is
- * idempotent; if the passed in lwb has already been opened, this
- * function is essentially a no-op.
+ * accept new itxs being committed to it. This function is idempotent; if
+ * the passed in lwb has already been opened, it is essentially a no-op.
*/
static void
zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
{
- zbookmark_phys_t zb;
- zio_priority_t prio;
-
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT3P(lwb, !=, NULL);
- EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
- EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
- if (lwb->lwb_root_zio != NULL)
+ if (lwb->lwb_state != LWB_STATE_NEW) {
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
return;
-
- lwb->lwb_root_zio = zio_root(zilog->zl_spa,
- zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
-
- abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
- BP_GET_LSIZE(&lwb->lwb_blk));
-
- if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
- prio = ZIO_PRIORITY_SYNC_WRITE;
- else
- prio = ZIO_PRIORITY_ASYNC_WRITE;
-
- SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
- ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
- lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
-
- lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0,
- &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
- zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb);
+ }
mutex_enter(&zilog->zl_lock);
lwb->lwb_state = LWB_STATE_OPENED;
- zil_lwb_set_zio_dependency(zilog, lwb);
zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
}
* Has to be called under zl_issuer_lock to chain more lwbs.
*/
static lwb_t *
-zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs)
+zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
{
- lwb_t *nlwb = NULL;
- zil_chain_t *zilc;
- spa_t *spa = zilog->zl_spa;
- blkptr_t *bp;
- dmu_tx_t *tx;
- uint64_t txg;
- uint64_t zil_blksz;
- int i, error;
- boolean_t slog;
+ int i;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT3P(lwb->lwb_root_zio, !=, NULL);
- ASSERT3P(lwb->lwb_write_zio, !=, NULL);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+ lwb->lwb_state = LWB_STATE_CLOSED;
/*
- * If this lwb includes indirect writes, we have to commit before
- * creating the transaction, otherwise we may end up in dead lock.
- */
- if (lwb->lwb_indirect) {
- for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
- itx = list_next(&lwb->lwb_itxs, itx))
- zil_lwb_commit(zilog, lwb, itx);
- lwb->lwb_nused = lwb->lwb_nfilled;
- }
-
- /*
- * Allocate the next block and save its address in this block
- * before writing it in order to establish the log chain.
+	 * If there was an allocation failure, then the NULL we return will
+	 * trigger zil_commit_writer_stall() in the caller. This is
+	 * inherently racy, since the allocation may not have happened yet.
*/
-
- tx = dmu_tx_create(zilog->zl_os);
-
- /*
- * Since we are not going to create any new dirty data, and we
- * can even help with clearing the existing dirty data, we
- * should not be subject to the dirty data based delays. We
- * use TXG_NOTHROTTLE to bypass the delay mechanism.
- */
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
-
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- mutex_enter(&zilog->zl_lwb_io_lock);
- lwb->lwb_issued_txg = txg;
- zilog->zl_lwb_inflight[txg & TXG_MASK]++;
- zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
- mutex_exit(&zilog->zl_lwb_io_lock);
+ if (lwb->lwb_error != 0)
+ return (NULL);
/*
* Log blocks are pre-allocated. Here we select the size of the next
* the maximum block size because we can exhaust the available
* pool log space.
*/
- zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+ uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
continue;
zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
- if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2)
- zilc = (zil_chain_t *)lwb->lwb_buf;
- else
- zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
- bp = &zilc->zc_next_blk;
- BP_ZERO(bp);
- error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
- if (error == 0) {
- ASSERT3U(bp->blk_birth, ==, txg);
- bp->blk_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
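+	/*
+	 * The new lwb has no block pointer yet; it will be set (or an
+	 * allocation error recorded) when the lwb preceding it is issued
+	 * via zil_lwb_write_issue().
+	 */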
+ return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state));
+}
- /*
- * Allocate a new log write block (lwb).
- */
- nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
- }
+/*
+ * Finalize previously closed block and issue the write zio.
+ */
+static void
+zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
+{
+ spa_t *spa = zilog->zl_spa;
+ zil_chain_t *zilc;
+ boolean_t slog;
+ zbookmark_phys_t zb;
+ zio_priority_t prio;
+ int error;
- lwb->lwb_state = LWB_STATE_ISSUED;
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
- dmu_tx_commit(tx);
+ /* Actually fill the lwb with the data. */
+ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
+ itx = list_next(&lwb->lwb_itxs, itx))
+ zil_lwb_commit(zilog, lwb, itx);
+ lwb->lwb_nused = lwb->lwb_nfilled;
+
+ lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
+ ZIO_FLAG_CANFAIL);
/*
- * We need to acquire the config lock for the lwb to issue it later.
- * However, if we already have a queue of closed parent lwbs already
- * holding the config lock (but not yet issued), we can't block here
- * waiting on the lock or we will deadlock. In that case we must
- * first issue to parent IOs before waiting on the lock.
+	 * The lwb is now ready to be issued, but only if its block pointer
+	 * has already been allocated or the allocation has failed.
+	 * Otherwise leave it as-is, relying on some other thread to issue it
+	 * after allocating its block pointer by calling zil_lwb_write_issue()
+ * for the previous lwb(s) in the chain.
*/
- if (ilwbs && !list_is_empty(ilwbs)) {
- if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) {
- lwb_t *tlwb;
- while ((tlwb = list_remove_head(ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, tlwb);
- spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
+ mutex_enter(&zilog->zl_lock);
+ lwb->lwb_state = LWB_STATE_READY;
+ if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ mutex_exit(&zilog->zl_lock);
+
+next_lwb:
+ if (lwb->lwb_slim)
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ else
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
+ int wsz = lwb->lwb_sz;
+ if (lwb->lwb_error == 0) {
+ abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
+ if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+ prio = ZIO_PRIORITY_SYNC_WRITE;
+ else
+ prio = ZIO_PRIORITY_ASYNC_WRITE;
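+		/*
+		 * The ZIL bookmark is built from the objset and sequence
+		 * number embedded in the block's checksum.
+		 */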
+ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
+ lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
+ &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
+ lwb, prio, ZIO_FLAG_CANFAIL, &zb);
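+		/* Remember the block's vdevs, flushed once the write completes. */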
+ zil_lwb_add_block(lwb, &lwb->lwb_blk);
+
+ if (lwb->lwb_slim) {
+ /* For Slim ZIL only write what is used. */
+ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
+ int);
+ ASSERT3S(wsz, <=, lwb->lwb_sz);
+ zio_shrink(lwb->lwb_write_zio, wsz);
+ wsz = lwb->lwb_write_zio->io_size;
}
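+		/* Clear unused data for security. */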
+ memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
+ zilc->zc_pad = 0;
+ zilc->zc_nused = lwb->lwb_nused;
+ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
} else {
- spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
+ /*
+ * We can't write the lwb if there was an allocation failure,
+ * so create a null zio instead just to maintain dependencies.
+ */
+ lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL,
+ zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL);
+ lwb->lwb_write_zio->io_error = lwb->lwb_error;
}
-
- if (ilwbs)
- list_insert_tail(ilwbs, lwb);
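+	/*
+	 * The write zio must wait for lwb_child_zio, which gathers the
+	 * zios created by zl_get_data() for indirect writes.
+	 */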
+ if (lwb->lwb_child_zio)
+ zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio);
/*
- * If there was an allocation failure then nlwb will be null which
- * forces a txg_wait_synced().
+	 * Open a transaction to allocate the next block pointer.
*/
- return (nlwb);
-}
+ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ uint64_t txg = dmu_tx_get_txg(tx);
-/*
- * Finalize previously closed block and issue the write zio.
- * Does not require locking.
- */
-static void
-zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
-{
- zil_chain_t *zilc;
- int wsz;
-
- /* Actually fill the lwb with the data if not yet. */
- if (!lwb->lwb_indirect) {
- for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
- itx = list_next(&lwb->lwb_itxs, itx))
- zil_lwb_commit(zilog, lwb, itx);
- lwb->lwb_nused = lwb->lwb_nfilled;
+ /*
+	 * Allocate the next block pointer unless we are already in error.
+ */
+ lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);
+ blkptr_t *bp = &zilc->zc_next_blk;
+ BP_ZERO(bp);
+ error = lwb->lwb_error;
+ if (error == 0) {
+ error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
+ &slog);
+ }
+ if (error == 0) {
+ ASSERT3U(bp->blk_birth, ==, txg);
+ BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
+ ZIO_CHECKSUM_ZILOG);
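+		/*
+		 * Seed the next block's checksum from this block's and
+		 * increment the sequence number to continue the log chain.
+		 */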
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
}
- if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
- /* For Slim ZIL only write what is used. */
- wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int);
- ASSERT3S(wsz, <=, lwb->lwb_sz);
- zio_shrink(lwb->lwb_write_zio, wsz);
- wsz = lwb->lwb_write_zio->io_size;
+ /*
+	 * Reduce TXG open time by incrementing the inflight counter and
+	 * committing the transaction. zil_sync() will wait for it to return
+	 * to zero.
+ */
+ mutex_enter(&zilog->zl_lwb_io_lock);
+ lwb->lwb_issued_txg = txg;
+ zilog->zl_lwb_inflight[txg & TXG_MASK]++;
+ zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
+ mutex_exit(&zilog->zl_lwb_io_lock);
+ dmu_tx_commit(tx);
- zilc = (zil_chain_t *)lwb->lwb_buf;
- } else {
- wsz = lwb->lwb_sz;
- zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
- }
- zilc->zc_pad = 0;
- zilc->zc_nused = lwb->lwb_nused;
- zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+ spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
/*
- * clear unused data for security
+ * We've completed all potentially blocking operations. Update the
+	 * nlwb and allow it to proceed without possible lock order reversals.
*/
- memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
+ mutex_enter(&zilog->zl_lock);
+ zil_lwb_set_zio_dependency(zilog, lwb);
+ lwb->lwb_state = LWB_STATE_ISSUED;
+
+ if (nlwb) {
+ nlwb->lwb_blk = *bp;
+ nlwb->lwb_error = error;
+ nlwb->lwb_slog = slog;
+ nlwb->lwb_alloc_txg = txg;
+ if (nlwb->lwb_state != LWB_STATE_READY)
+ nlwb = NULL;
+ }
+ mutex_exit(&zilog->zl_lock);
if (lwb->lwb_slog) {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
BP_GET_LSIZE(&lwb->lwb_blk));
}
- ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER));
- zil_lwb_add_block(lwb, &lwb->lwb_blk);
lwb->lwb_issued_timestamp = gethrtime();
zio_nowait(lwb->lwb_root_zio);
zio_nowait(lwb->lwb_write_zio);
+ if (lwb->lwb_child_zio)
+ zio_nowait(lwb->lwb_child_zio);
+
+ /*
+ * If nlwb was ready when we gave it the block pointer,
+	 * it is on us to issue it, and possibly the following ones.
+ */
+ lwb = nlwb;
+ if (lwb)
+ goto next_lwb;
}
/*
* For more details, see the comment above zil_commit().
*/
if (lr->lrc_txtype == TX_COMMIT) {
- mutex_enter(&zilog->zl_lock);
zil_commit_waiter_link_lwb(itx->itx_private, lwb);
- itx->itx_private = NULL;
- mutex_exit(&zilog->zl_lock);
list_insert_tail(&lwb->lwb_itxs, itx);
return (lwb);
}
* If this record won't fit in the current log block, start a new one.
* For WR_NEED_COPY optimize layout for minimal number of chunks.
*/
- lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
lwb_sp < zil_max_waste_space(zilog) &&
(dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) {
- lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
+ list_insert_tail(ilwbs, lwb);
+ lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
if (lwb == NULL)
return (NULL);
- zil_lwb_write_open(zilog, lwb);
- lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
/*
* There must be enough space in the new, empty log block to
clr->lrc_seq = ++zilog->zl_lr_seq;
lwb->lwb_nused += reclen + dnow;
- ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
zil_lwb_add_txg(lwb, lr->lrc_txg);
goto cont;
}
- /*
- * We have to really issue all queued LWBs before we may have to
- * wait for a txg sync. Otherwise we may end up in a dead lock.
- */
- if (lr->lrc_txtype == TX_WRITE) {
- boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa);
- if (frozen || itx->itx_wr_state == WR_INDIRECT) {
- lwb_t *tlwb;
- while ((tlwb = list_remove_head(ilwbs)) != NULL)
- zil_lwb_write_issue(zilog, tlwb);
- }
- if (itx->itx_wr_state == WR_INDIRECT)
- lwb->lwb_indirect = B_TRUE;
- if (frozen)
- txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
- }
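+	/*
+	 * If this write is beyond the pool freeze txg (only possible when
+	 * the pool has been frozen for ZIL testing), wait for that txg to
+	 * sync before returning.
+	 */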
+ if (lr->lrc_txtype == TX_WRITE &&
+ lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
return (lwb);
}
ZIL_STAT_BUMP(zilog, zil_itx_indirect_count);
ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes,
lrw->lr_length);
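+			/*
+			 * Lazily create the root zio that will gather all
+			 * indirect write zios issued for this lwb.
+			 */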
+ if (lwb->lwb_child_zio == NULL) {
+ lwb->lwb_child_zio = zio_root(
+ zilog->zl_spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
}
/*
- * We pass in the "lwb_write_zio" rather than
- * "lwb_root_zio" so that the "lwb_write_zio"
- * becomes the parent of any zio's created by
- * the "zl_get_data" callback. The vdevs are
- * flushed after the "lwb_write_zio" completes,
- * so we want to make sure that completion
- * callback waits for these additional zio's,
- * such that the vdevs used by those zio's will
- * be included in the lwb's vdev tree, and those
- * vdevs will be properly flushed. If we passed
- * in "lwb_root_zio" here, then these additional
- * vdevs may not be flushed; e.g. if these zio's
- * completed after "lwb_write_zio" completed.
+			 * The "lwb_child_zio" we pass in will become a child
+			 * of "lwb_write_zio" once one is created, and so a
+			 * parent of any zios created by the "zl_get_data"
+			 * callback. This way "lwb_write_zio" will first wait
+			 * for the children's block pointers before issuing
+			 * its own write, and then for their write completion
+			 * before flushing the vdev caches.
*/
error = zilog->zl_get_data(itx->itx_private,
itx->itx_gen, lrwb, dbuf, lwb,
- lwb->lwb_write_zio);
+ lwb->lwb_child_zio);
if (dbuf != NULL && error == 0) {
/* Zero any padding bytes in the last block. */
memset((char *)dbuf + lrwb->lr_length, 0,
error);
zfs_fallthrough;
case EIO:
- if (lwb->lwb_indirect) {
- txg_wait_synced(zilog->zl_dmu_pool,
- lr->lrc_txg);
- } else {
- lwb->lwb_write_zio->io_error = error;
- }
+ txg_wait_synced(zilog->zl_dmu_pool,
+ lr->lrc_txg);
zfs_fallthrough;
case ENOENT:
zfs_fallthrough;
zil_commit_waiter_skip(itx->itx_private);
} else {
zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
- itx->itx_private = NULL;
}
mutex_exit(&zilog->zl_lock);
* have already been created (zl_lwb_list not empty).
*/
zil_commit_activate_saxattr_feature(zilog);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
- first = (lwb->lwb_state != LWB_STATE_OPENED) &&
+ ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
+ lwb->lwb_state == LWB_STATE_OPENED);
+ first = (lwb->lwb_state == LWB_STATE_NEW) &&
((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL ||
plwb->lwb_state == LWB_STATE_FLUSH_DONE);
}
} else {
ASSERT(list_is_empty(&nolwb_waiters));
ASSERT3P(lwb, !=, NULL);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+ ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
+ lwb->lwb_state == LWB_STATE_OPENED);
/*
* At this point, the ZIL block pointed at by the "lwb"
- * variable is in one of the following states: "closed"
- * or "open".
+ * variable is in "new" or "opened" state.
*
- * If it's "closed", then no itxs have been committed to
- * it, so there's no point in issuing its zio (i.e. it's
- * "empty").
+ * If it's "new", then no itxs have been committed to it, so
+ * there's no point in issuing its zio (i.e. it's "empty").
*
- * If it's "open", then it contains one or more itxs that
+ * If it's "opened", then it contains one or more itxs that
* eventually need to be committed to stable storage. In
* this case we intentionally do not issue the lwb's zio
* to disk yet, and instead rely on one of the following
* two mechanisms for issuing the zio:
*
- * 1. Ideally, there will be more ZIL activity occurring
- * on the system, such that this function will be
- * immediately called again (not necessarily by the same
- * thread) and this lwb's zio will be issued via
- * zil_lwb_assign(). This way, the lwb is guaranteed to
- * be "full" when it is issued to disk, and we'll make
- * use of the lwb's size the best we can.
+ * 1. Ideally, there will be more ZIL activity occurring on
+ * the system, such that this function will be immediately
+		 *    called again by a different thread, and this lwb will be
+ * closed by zil_lwb_assign(). This way, the lwb will be
+ * "full" when it is issued to disk, and we'll make use of
+ * the lwb's size the best we can.
*
* 2. If there isn't sufficient ZIL activity occurring on
- * the system, such that this lwb's zio isn't issued via
- * zil_lwb_assign(), zil_commit_waiter() will issue the
- * lwb's zio. If this occurs, the lwb is not guaranteed
+ * the system, zil_commit_waiter() will close it and issue
+ * the zio. If this occurs, the lwb is not guaranteed
* to be "full" by the time its zio is issued, and means
* the size of the lwb was "too large" given the amount
* of ZIL activity occurring on the system at that time.
hrtime_t sleep = zilog->zl_last_lwb_latency *
zfs_commit_timeout_pct / 100;
if (sleep < zil_min_commit_timeout ||
- lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) {
- lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
+ lwb->lwb_nmax - lwb->lwb_nused <
+ lwb->lwb_nmax / 8) {
+ list_insert_tail(ilwbs, lwb);
+ lwb = zil_lwb_write_close(zilog, lwb,
+ LWB_STATE_NEW);
zilog->zl_cur_used = 0;
if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs))
lwb_t *lwb = zcw->zcw_lwb;
ASSERT3P(lwb, !=, NULL);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
/*
* If the lwb has already been issued by another thread, we can
* do this prior to acquiring the zl_issuer_lock, to avoid
* acquiring it when it's not necessary to do so.
*/
- if (lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ if (lwb->lwb_state != LWB_STATE_OPENED)
return;
/*
* wind up with a use-after-free error below.
*/
if (zcw->zcw_done) {
- lwb = NULL;
- goto out;
+ mutex_exit(&zilog->zl_issuer_lock);
+ return;
}
ASSERT3P(lwb, ==, zcw->zcw_lwb);
* second time while holding the lock.
*
* We don't need to hold the zl_lock since the lwb cannot transition
- * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
- * _can_ transition from ISSUED to DONE, but it's OK to race with
+ * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb
+ * _can_ transition from CLOSED to DONE, but it's OK to race with
* that transition since we treat the lwb the same, whether it's in
- * the ISSUED or DONE states.
+ * the CLOSED, ISSUED or DONE states.
*
* The important thing, is we treat the lwb differently depending on
- * if it's ISSUED or OPENED, and block any other threads that might
- * attempt to issue this lwb. For that reason we hold the
+ * if it's OPENED or CLOSED, and block any other threads that might
+ * attempt to close/issue this lwb. For that reason we hold the
* zl_issuer_lock when checking the lwb_state; we must not call
- * zil_lwb_write_close() if the lwb had already been issued.
+ * zil_lwb_write_close() if the lwb had already been closed/issued.
*
* See the comment above the lwb_state_t structure definition for
* more details on the lwb states, and locking requirements.
*/
- if (lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
- lwb = NULL;
- goto out;
+ if (lwb->lwb_state != LWB_STATE_OPENED) {
+ mutex_exit(&zilog->zl_issuer_lock);
+ return;
}
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+ /*
+ * We do not need zcw_lock once we hold zl_issuer_lock and know lwb
+	 * is still open. But we have to drop it to avoid a deadlock in case
+	 * a callback of a zio issued by zil_lwb_write_issue() tries to take
+	 * it while zil_lwb_write_issue() itself is blocked trying to issue
+	 * the next lwb it found in LWB_STATE_READY state.
+ */
+ mutex_exit(&zcw->zcw_lock);
/*
* As described in the comments above zil_commit_waiter() and
* since we've reached the commit waiter's timeout and it still
* hasn't been issued.
*/
- lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL);
+ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
/*
* Since the lwb's zio hadn't been issued by the time this thread
* "next" lwb on-disk. When this occurs, the ZIL write
* pipeline must be stalled; see the comment within the
* zil_commit_writer_stall() function for more details.
- *
- * We must drop the commit waiter's lock prior to
- * calling zil_commit_writer_stall() or else we can wind
- * up with the following deadlock:
- *
- * - This thread is waiting for the txg to sync while
- * holding the waiter's lock; txg_wait_synced() is
- * used within txg_commit_writer_stall().
- *
- * - The txg can't sync because it is waiting for this
- * lwb's zio callback to call dmu_tx_commit().
- *
- * - The lwb's zio callback can't call dmu_tx_commit()
- * because it's blocked trying to acquire the waiter's
- * lock, which occurs prior to calling dmu_tx_commit()
*/
- mutex_exit(&zcw->zcw_lock);
zil_lwb_write_issue(zilog, lwb);
- lwb = NULL;
zil_commit_writer_stall(zilog);
- mutex_enter(&zcw->zcw_lock);
- }
-
-out:
- mutex_exit(&zilog->zl_issuer_lock);
- if (lwb)
+ mutex_exit(&zilog->zl_issuer_lock);
+ } else {
+ mutex_exit(&zilog->zl_issuer_lock);
zil_lwb_write_issue(zilog, lwb);
- ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+ }
+ mutex_enter(&zcw->zcw_lock);
}
/*
* where it's "zcw_lwb" field is NULL, and it hasn't yet
* been skipped, so it's "zcw_done" field is still B_FALSE.
*/
- IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
+ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW);
if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
ASSERT3B(timedout, ==, B_FALSE);
*/
IMPLY(lwb != NULL,
+ lwb->lwb_state == LWB_STATE_CLOSED ||
+ lwb->lwb_state == LWB_STATE_READY ||
lwb->lwb_state == LWB_STATE_ISSUED ||
lwb->lwb_state == LWB_STATE_WRITE_DONE ||
lwb->lwb_state == LWB_STATE_FLUSH_DONE);
while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
zh->zh_log = lwb->lwb_blk;
if (lwb->lwb_state != LWB_STATE_FLUSH_DONE ||
- lwb->lwb_max_txg > txg)
+ lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg)
break;
list_remove(&zilog->zl_lwb_list, lwb);
- zio_free(spa, txg, &lwb->lwb_blk);
+ if (!BP_IS_HOLE(&lwb->lwb_blk))
+ zio_free(spa, txg, &lwb->lwb_blk);
zil_free_lwb(zilog, lwb);
/*
}
mutex_enter(&zilog->zl_lock);
+ txg = zilog->zl_dirty_max_txg;
lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL)
- txg = zilog->zl_dirty_max_txg;
- else
- txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
+ if (lwb != NULL) {
+ txg = MAX(txg, lwb->lwb_alloc_txg);
+ txg = MAX(txg, lwb->lwb_max_txg);
+ }
mutex_exit(&zilog->zl_lock);
/*
* zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
* on the time when the dmu_tx transaction is assigned in
- * zil_lwb_write_close().
+ * zil_lwb_write_issue().
*/
mutex_enter(&zilog->zl_lwb_io_lock);
txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
lwb = list_remove_head(&zilog->zl_lwb_list);
if (lwb != NULL) {
ASSERT(list_is_empty(&zilog->zl_lwb_list));
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
zil_free_lwb(zilog, lwb);
}
/*
* We need to use zil_commit_impl to ensure we wait for all
- * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed
+ * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed
* to disk before proceeding. If we used zil_commit instead, it
* would just call txg_wait_synced(), because zl_suspend is set.
* txg_wait_synced() doesn't wait for these lwb's to be