OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue

[mirror_zfs.git] / module / zfs / zil.c
diff --git a/module/zfs/zil.c b/module/zfs/zil.c

index b3b0699005ad2ee50a666a8d6daf53f3d699c7ba..a2bbdcb9aa5d72cbd0b007723b152499f0390e82 100644 (file)
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -20,7 +20,8 @@
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
   */
  
  /* Portions Copyright 2010 Robert Milkowski */
@@ -101,6 +102,13 @@ int zil_replay_disable = 0;
   */
  int zfs_nocacheflush = 0;
  
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that will be executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by a single active ZIL writer.
+ */
+unsigned long zil_slog_bulk = 768 * 1024;
+
  static kmem_cache_t *zil_lwb_cache;
  
  static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
@@ -108,16 +116,6 @@ static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
  #define        LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
      sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
  
-
-/*
- * ziltest is by and large an ugly hack, but very useful in
- * checking replay without tedious work.
- * When running ziltest we want to keep all itx's and so maintain
- * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
- * We subtract TXG_CONCURRENT_STATES to allow for common code.
- */
-#define        ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
-
  static int
  zil_bp_compare(const void *x1, const void *x2)
  {
@@ -458,7 +456,8 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
  }
  
  static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
+    boolean_t fastwrite)
  {
         lwb_t *lwb;
  
@@ -466,6 +465,7 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite)
         lwb->lwb_zilog = zilog;
         lwb->lwb_blk = *bp;
         lwb->lwb_fastwrite = fastwrite;
+       lwb->lwb_slog = slog;
         lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
         lwb->lwb_max_txg = txg;
         lwb->lwb_zio = NULL;
@@ -504,6 +504,27 @@ zilog_dirty(zilog_t *zilog, uint64_t txg)
         }
  }
  
+/*
+ * Determine if the zil is dirty in the specified txg. Callers wanting to
+ * ensure that the dirty state does not change must hold the itxg_lock for
+ * the specified txg. Holding the lock will ensure that the zil cannot be
+ * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
+ * state.
+ */
+boolean_t
+zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
+{
+       dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+       if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
+               return (B_TRUE);
+       return (B_FALSE);
+}
+
+/*
+ * Determine if the zil is dirty. The zil is considered dirty if it has
+ * any pending itx records that have not been cleaned by zil_clean().
+ */
  boolean_t
  zilog_is_dirty(zilog_t *zilog)
  {
@@ -530,6 +551,7 @@ zil_create(zilog_t *zilog)
         blkptr_t blk;
         int error = 0;
         boolean_t fastwrite = FALSE;
+       boolean_t slog = FALSE;
  
         /*
          * Wait for any previous destroy to complete.
@@ -544,7 +566,7 @@ zil_create(zilog_t *zilog)
         /*
          * Allocate an initial log block if:
          *    - there isn't one already
-        *    - the existing block is the wrong endianess
+        *    - the existing block is the wrong endianness
          */
         if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
                 tx = dmu_tx_create(zilog->zl_os);
@@ -558,7 +580,7 @@ zil_create(zilog_t *zilog)
                 }
  
                 error = zio_alloc_zil(zilog->zl_spa, txg, &blk,
-                   ZIL_MIN_BLKSZ, B_TRUE);
+                   ZIL_MIN_BLKSZ, &slog);
                 fastwrite = TRUE;
  
                 if (error == 0)
@@ -569,7 +591,7 @@ zil_create(zilog_t *zilog)
          * Allocate a log write buffer (lwb) for the first log block.
          */
         if (error == 0)
-               lwb = zil_alloc_lwb(zilog, &blk, txg, fastwrite);
+               lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite);
  
         /*
          * If we just allocated the first log block, commit our transaction
@@ -903,6 +925,7 @@ static void
  zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
  {
         zbookmark_phys_t zb;
+       zio_priority_t prio;
  
         SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
             ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -922,9 +945,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
                         metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
                         lwb->lwb_fastwrite = 1;
                 }
+               if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+                       prio = ZIO_PRIORITY_SYNC_WRITE;
+               else
+                       prio = ZIO_PRIORITY_ASYNC_WRITE;
                 lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
                     0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
-                   zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
+                   zil_lwb_write_done, lwb, prio,
                     ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
                     ZIO_FLAG_FASTWRITE, &zb);
         }
@@ -945,15 +972,6 @@ uint64_t zil_block_buckets[] = {
      UINT64_MAX
  };
  
-/*
- * Use the slog as long as the current commit size is less than the
- * limit or the total list size is less than 2X the limit.  Limit
- * checking is disabled by setting zil_slog_limit to UINT64_MAX.
- */
-unsigned long zil_slog_limit = 1024 * 1024;
-#define        USE_SLOG(zilog) (((zilog)->zl_cur_used < zil_slog_limit) || \
-       ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))
-
  /*
   * Start a log block write and advance to the next log block.
   * Calls are serialized.
@@ -969,7 +987,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
         uint64_t txg;
         uint64_t zil_blksz, wsz;
         int i, error;
-       boolean_t use_slog;
+       boolean_t slog;
  
         if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
                 zilc = (zil_chain_t *)lwb->lwb_buf;
@@ -991,7 +1009,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
          * to clean up in the event of allocation failure or I/O failure.
          */
         tx = dmu_tx_create(zilog->zl_os);
-       VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+
+       /*
+        * Since we are not going to create any new dirty data, and we
+        * can even help with clearing the existing dirty data, we
+        * should not be subject to the dirty data based delays. We
+        * use TXG_NOTHROTTLE to bypass the delay mechanism.
+        */
+       VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
         dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
         txg = dmu_tx_get_txg(tx);
  
@@ -1025,10 +1051,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
         zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
  
         BP_ZERO(bp);
-       use_slog = USE_SLOG(zilog);
-       error = zio_alloc_zil(spa, txg, bp, zil_blksz,
-           USE_SLOG(zilog));
-       if (use_slog) {
+       error = zio_alloc_zil(spa, txg, bp, zil_blksz, &slog);
+       if (slog) {
                 ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
                 ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused);
         } else {
@@ -1043,7 +1067,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
                 /*
                  * Allocate a new log write buffer (lwb).
                  */
-               nlwb = zil_alloc_lwb(zilog, bp, txg, TRUE);
+               nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
  
                 /* Record the block for later vdev flushing */
                 zil_add_block(zilog, &lwb->lwb_blk);
@@ -1080,47 +1104,53 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
  static lwb_t *
  zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
  {
-       lr_t *lrc = &itx->itx_lr; /* common log record */
-       lr_write_t *lrw = (lr_write_t *)lrc;
+       lr_t *lrcb, *lrc;
+       lr_write_t *lrwb, *lrw;
         char *lr_buf;
-       uint64_t txg = lrc->lrc_txg;
-       uint64_t reclen = lrc->lrc_reclen;
-       uint64_t dlen = 0;
+       uint64_t dlen, dnow, lwb_sp, reclen, txg;
  
         if (lwb == NULL)
                 return (NULL);
  
         ASSERT(lwb->lwb_buf != NULL);
-       ASSERT(zilog_is_dirty(zilog) ||
-           spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
  
-       if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+       lrc = &itx->itx_lr;             /* Common log record inside itx. */
+       lrw = (lr_write_t *)lrc;        /* Write log record inside itx. */
+       if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
                 dlen = P2ROUNDUP_TYPED(
                     lrw->lr_length, sizeof (uint64_t), uint64_t);
-
+       } else {
+               dlen = 0;
+       }
+       reclen = lrc->lrc_reclen;
         zilog->zl_cur_used += (reclen + dlen);
+       txg = lrc->lrc_txg;
  
         zil_lwb_write_init(zilog, lwb);
  
+cont:
         /*
          * If this record won't fit in the current log block, start a new one.
+        * For WR_NEED_COPY, optimize layout for a minimal number of chunks.
          */
-       if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
+       lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+       if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+           lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
+           lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
                 lwb = zil_lwb_write_start(zilog, lwb);
                 if (lwb == NULL)
                         return (NULL);
                 zil_lwb_write_init(zilog, lwb);
                 ASSERT(LWB_EMPTY(lwb));
-               if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
-                       txg_wait_synced(zilog->zl_dmu_pool, txg);
-                       return (lwb);
-               }
+               lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+               ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
         }
  
+       dnow = MIN(dlen, lwb_sp - reclen);
         lr_buf = lwb->lwb_buf + lwb->lwb_nused;
         bcopy(lrc, lr_buf, reclen);
-       lrc = (lr_t *)lr_buf;
-       lrw = (lr_write_t *)lrc;
+       lrcb = (lr_t *)lr_buf;          /* Like lrc, but inside lwb. */
+       lrwb = (lr_write_t *)lrcb;      /* Like lrw, but inside lwb. */
  
         ZIL_STAT_BUMP(zil_itx_count);
  
@@ -1137,13 +1167,15 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
                         char *dbuf;
                         int error;
  
-                       if (dlen) {
-                               ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+                       if (itx->itx_wr_state == WR_NEED_COPY) {
                                 dbuf = lr_buf + reclen;
-                               lrw->lr_common.lrc_reclen += dlen;
+                               lrcb->lrc_reclen += dnow;
+                               if (lrwb->lr_length > dnow)
+                                       lrwb->lr_length = dnow;
+                               lrw->lr_offset += dnow;
+                               lrw->lr_length -= dnow;
                                 ZIL_STAT_BUMP(zil_itx_needcopy_count);
-                               ZIL_STAT_INCR(zil_itx_needcopy_bytes,
-                                   lrw->lr_length);
+                               ZIL_STAT_INCR(zil_itx_needcopy_bytes, dnow);
                         } else {
                                 ASSERT(itx->itx_wr_state == WR_INDIRECT);
                                 dbuf = NULL;
@@ -1152,7 +1184,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
                                     lrw->lr_length);
                         }
                         error = zilog->zl_get_data(
-                           itx->itx_private, lrw, dbuf, lwb->lwb_zio);
+                           itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
                         if (error == EIO) {
                                 txg_wait_synced(zilog->zl_dmu_pool, txg);
                                 return (lwb);
@@ -1171,30 +1203,38 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
          * equal to the itx sequence number because not all transactions
          * are synchronous, and sometimes spa_sync() gets there first.
          */
-       lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
-       lwb->lwb_nused += reclen + dlen;
+       lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+       lwb->lwb_nused += reclen + dnow;
         lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
         ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
         ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
  
+       dlen -= dnow;
+       if (dlen > 0) {
+               zilog->zl_cur_used += reclen;
+               goto cont;
+       }
+
         return (lwb);
  }
  
  itx_t *
  zil_itx_create(uint64_t txtype, size_t lrsize)
  {
+       size_t itxsize;
         itx_t *itx;
  
         lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
+       itxsize = offsetof(itx_t, itx_lr) + lrsize;
  
-       itx = zio_data_buf_alloc(offsetof(itx_t, itx_lr) + lrsize);
+       itx = zio_data_buf_alloc(itxsize);
         itx->itx_lr.lrc_txtype = txtype;
         itx->itx_lr.lrc_reclen = lrsize;
-       itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
         itx->itx_lr.lrc_seq = 0;        /* defensive */
         itx->itx_sync = B_TRUE;         /* default is synchronous */
         itx->itx_callback = NULL;
         itx->itx_callback_data = NULL;
+       itx->itx_size = itxsize;
  
         return (itx);
  }
@@ -1202,7 +1242,7 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
  void
  zil_itx_destroy(itx_t *itx)
  {
-       zio_data_buf_free(itx, offsetof(itx_t, itx_lr)+itx->itx_lr.lrc_reclen);
+       zio_data_buf_free(itx, itx->itx_size);
  }
  
  /*
@@ -1339,11 +1379,10 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
                          * this itxg. Save the itxs for release below.
                          * This should be rare.
                          */
-                       atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-                       itxg->itxg_sod = 0;
+                       zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
+                           "txg %llu", itxg->itxg_txg);
                         clean = itxg->itxg_itxs;
                 }
-               ASSERT(itxg->itxg_sod == 0);
                 itxg->itxg_txg = txg;
                 itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
                     KM_SLEEP);
@@ -1356,8 +1395,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
         }
         if (itx->itx_sync) {
                 list_insert_tail(&itxs->i_sync_list, itx);
-               atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
-               itxg->itxg_sod += itx->itx_sod;
         } else {
                 avl_tree_t *t = &itxs->i_async_tree;
                 uint64_t foid =
@@ -1407,8 +1444,6 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
         ASSERT3U(itxg->itxg_txg, <=, synced_txg);
         ASSERT(itxg->itxg_txg != 0);
         ASSERT(zilog->zl_clean_taskq != NULL);
-       atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-       itxg->itxg_sod = 0;
         clean_me = itxg->itxg_itxs;
         itxg->itxg_itxs = NULL;
         itxg->itxg_txg = 0;
@@ -1432,13 +1467,17 @@ zil_get_commit_list(zilog_t *zilog)
  {
         uint64_t otxg, txg;
         list_t *commit_list = &zilog->zl_itx_commit_list;
-       uint64_t push_sod = 0;
  
         if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
                 otxg = ZILTEST_TXG;
         else
                 otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
  
+       /*
+        * This is inherently racy, since there is nothing to prevent
+        * the last synced txg from changing. That's okay since we'll
+        * only commit things in the future.
+        */
         for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
                 itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
  
@@ -1448,13 +1487,20 @@ zil_get_commit_list(zilog_t *zilog)
                         continue;
                 }
  
+               /*
+                * If we're adding itx records to the zl_itx_commit_list,
+                * then the zil better be dirty in this "txg". We can assert
+                * that here since we're holding the itxg_lock which will
+                * prevent spa_sync from cleaning it. Once we add the itxs
+                * to the zl_itx_commit_list we must commit it to disk even
+                * if it's unnecessary (i.e. the txg was synced).
+                */
+               ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
+                   spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
                 list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
-               push_sod += itxg->itxg_sod;
-               itxg->itxg_sod = 0;
  
                 mutex_exit(&itxg->itxg_lock);
         }
-       atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
  }
  
  /*
@@ -1473,6 +1519,10 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid)
         else
                 otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
  
+       /*
+        * This is inherently racy, since there is nothing to prevent
+        * the last synced txg from changing.
+        */
         for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
                 itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
  
@@ -1545,8 +1595,14 @@ zil_commit_writer(zilog_t *zilog)
         for (itx = list_head(&zilog->zl_itx_commit_list); itx != NULL;
             itx = list_next(&zilog->zl_itx_commit_list, itx)) {
                 txg = itx->itx_lr.lrc_txg;
-               ASSERT(txg);
+               ASSERT3U(txg, !=, 0);
  
+               /*
+                * This is inherently racy and may result in us writing
+                * out a log block for a txg that was just synced. This is
+                * ok since we'll end up cleaning up that log block the next
+                * time we call zil_sync().
+                */
                 if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
                         lwb = zil_lwb_commit(zilog, itx, lwb);
         }
@@ -1907,8 +1963,11 @@ zil_close(zilog_t *zilog)
         mutex_exit(&zilog->zl_lock);
         if (txg)
                 txg_wait_synced(zilog->zl_dmu_pool, txg);
+
+       if (zilog_is_dirty(zilog))
+               zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
         if (txg < spa_freeze_txg(zilog->zl_spa))
-               ASSERT(!zilog_is_dirty(zilog));
+               VERIFY(!zilog_is_dirty(zilog));
  
         taskq_destroy(zilog->zl_clean_taskq);
         zilog->zl_clean_taskq = NULL;
@@ -2264,13 +2323,14 @@ EXPORT_SYMBOL(zil_bp_tree_add);
  EXPORT_SYMBOL(zil_set_sync);
  EXPORT_SYMBOL(zil_set_logbias);
  
+/* BEGIN CSTYLED */
  module_param(zil_replay_disable, int, 0644);
  MODULE_PARM_DESC(zil_replay_disable, "Disable intent logging replay");
  
  module_param(zfs_nocacheflush, int, 0644);
  MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes");
  
-/* CSTYLED */
-module_param(zil_slog_limit, ulong, 0644);
-MODULE_PARM_DESC(zil_slog_limit, "Max commit bytes to separate log device");
+module_param(zil_slog_bulk, ulong, 0644);
+MODULE_PARM_DESC(zil_slog_bulk, "Limit in bytes slog sync writes per commit");
+/* END CSTYLED */
  #endif