OpenZFS 8909 - 8585 can cause a use-after-free kernel panic

author Prakash Surya <prakash.surya@delphix.com>

Thu, 7 Dec 2017 19:26:32 +0000 (11:26 -0800)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Thu, 28 Dec 2017 18:18:04 +0000 (10:18 -0800)
author Prakash Surya <prakash.surya@delphix.com>
Thu, 7 Dec 2017 19:26:32 +0000 (11:26 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 28 Dec 2017 18:18:04 +0000 (10:18 -0800)
diff --git a/include/sys/zil.h b/include/sys/zil.h

index 5513c7ce8ed999885b16392b4fac390898ef3a2d..fc0cc506592ebc69f8a930e6927b43c89963579f 100644 (file)
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -494,6 +494,7 @@ extern void zil_itx_destroy(itx_t *itx);
  extern void    zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
  
  extern void    zil_commit(zilog_t *zilog, uint64_t oid);
+extern void    zil_commit_impl(zilog_t *zilog, uint64_t oid);
  
  extern int     zil_vdev_offline(const char *osname, void *txarg);
  extern int     zil_claim(struct dsl_pool *dp,
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h

index 7e27215ca329b1816ea1145d5fd5348a50cc2c6a..6901d51f083eb01d19aa2ef9855a8b3249a5b1f6 100644 (file)
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -36,13 +36,38 @@ extern "C" {
  #endif
  
  /*
- * Possbile states for a given lwb structure. An lwb will start out in
- * the "closed" state, and then transition to the "opened" state via a
- * call to zil_lwb_write_open(). After the lwb is "open", it can
- * transition into the "issued" state via zil_lwb_write_issue(). After
- * the lwb's zio completes, and the vdev's are flushed, the lwb will
- * transition into the "done" state via zil_lwb_write_done(), and the
- * structure eventually freed.
+ * Possible states for a given lwb structure.
+ *
+ * An lwb will start out in the "closed" state, and then transition to
+ * the "opened" state via a call to zil_lwb_write_open(). When
+ * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock"
+ * must be held.
+ *
+ * After the lwb is "opened", it can transition into the "issued" state
+ * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
+ * be held when making this transition.
+ *
+ * After the lwb's zio completes, and the vdev's are flushed, the lwb
+ * will transition into the "done" state via zil_lwb_write_done(). When
+ * transitioning from "issued" to "done", the zilog's "zl_lock" must be
+ * held, *not* the "zl_issuer_lock".
+ *
+ * The zilog's "zl_issuer_lock" can become heavily contended in certain
+ * workloads, so we specifically avoid acquiring that lock when
+ * transitioning an lwb from "issued" to "done". This allows us to avoid
+ * having to acquire the "zl_issuer_lock" for each lwb ZIO completion,
+ * which would have added more lock contention on an already heavily
+ * contended lock.
+ *
+ * Additionally, correctness when reading an lwb's state is often
+ * achieved by exploiting the fact that these state transitions occur in
+ * this specific order; i.e. "closed" to "opened" to "issued" to "done".
+ *
+ * Thus, if an lwb is in the "closed" or "opened" state, holding the
+ * "zl_issuer_lock" will prevent a concurrent thread from transitioning
+ * that lwb to the "issued" state. Likewise, if an lwb is already in the
+ * "issued" state, holding the "zl_lock" will prevent a concurrent
+ * thread from transitioning that lwb to the "done" state.
   */
  typedef enum {
      LWB_STATE_CLOSED,
diff --git a/include/sys/zio.h b/include/sys/zio.h

index 8beea7adf1e674466a60a75c3ab8877324f4d924..bf4818e16cfea064597676b9f58a487a4ddff7ce 100644 (file)
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -598,7 +598,6 @@ extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
  extern enum zio_compress zio_compress_select(spa_t *spa,
      enum zio_compress child, enum zio_compress parent);
  
-extern void zio_cancel(zio_t *zio);
  extern void zio_suspend(spa_t *spa, zio_t *zio);
  extern int zio_resume(spa_t *spa);
  extern void zio_resume_wait(spa_t *spa);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5

index e6e61906128ee283445953c88a54ccb646941ef7..4dbf7d766a1e31f37bda4f6dbccfca2ead55b8b6 100644 (file)
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -775,6 +775,21 @@ Disable pool import at module load by ignoring the cache file (typically \fB/etc
  Use \fB1\fR for yes (default) and \fB0\fR for no.
  .RE
  
+.sp
+.ne 2
+.na
+\fBzfs_commit_timeout_pct\fR (int)
+.ad
+.RS 12n
+This controls the amount of time that a ZIL block (lwb) will remain "open"
+when it isn't "full", and it has a thread waiting for it to be committed to
+stable storage.  The timeout is scaled based on a percentage of the last lwb
+latency to avoid significantly impacting the latency of each individual
+transaction record (itx).
+.sp
+Default value: \fB5\fR.
+.RE
+
  .sp
  .ne 2
  .na
@@ -2120,7 +2135,7 @@ Default value: \fB0\fR.
  .ad
  .RS 12n
  The maximum number of taskq entries that are allowed to be cached.  When this
-limit is exceeded itx's will be cleaned synchronously.
+limit is exceeded transaction records (itxs) will be cleaned synchronously.
  .sp
  Default value: \fB1048576\fR.
  .RE
diff --git a/module/zfs/zil.c b/module/zfs/zil.c

index b5685d73b8f220e9cba55b83867d90003b3ca3bd..81bc6de412e0f9a890c7ed43a70f368dc0d0a2c5 100644 (file)
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -57,7 +57,7 @@
   *
   * In the event of a crash or power loss, the itxs contained by each
   * dataset's on-disk ZIL will be replayed when that dataset is first
- * instantianted (e.g. if the dataset is a normal fileystem, when it is
+ * instantiated (e.g. if the dataset is a normal fileystem, when it is
   * first mounted).
   *
   * As hinted at above, there is one ZIL per dataset (both the in-memory
@@ -539,8 +539,8 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
  
         ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
         ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
-       ASSERT(list_is_empty(&lwb->lwb_waiters));
-       ASSERT(list_is_empty(&lwb->lwb_itxs));
+       VERIFY(list_is_empty(&lwb->lwb_waiters));
+       VERIFY(list_is_empty(&lwb->lwb_itxs));
  
         return (lwb);
  }
@@ -550,40 +550,14 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
  {
         ASSERT(MUTEX_HELD(&zilog->zl_lock));
         ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
-       ASSERT(list_is_empty(&lwb->lwb_waiters));
-
-       if (lwb->lwb_state == LWB_STATE_OPENED) {
-               avl_tree_t *t = &lwb->lwb_vdev_tree;
-               void *cookie = NULL;
-               zil_vdev_node_t *zv;
-               itx_t *itx;
-
-               while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
-                       kmem_free(zv, sizeof (*zv));
-
-               while ((itx = list_head(&lwb->lwb_itxs)) != NULL) {
-                       if (itx->itx_callback != NULL)
-                               itx->itx_callback(itx->itx_callback_data);
-                       list_remove(&lwb->lwb_itxs, itx);
-                       zil_itx_destroy(itx);
-               }
-
-               ASSERT3P(lwb->lwb_root_zio, !=, NULL);
-               ASSERT3P(lwb->lwb_write_zio, !=, NULL);
-
-               zio_cancel(lwb->lwb_root_zio);
-               zio_cancel(lwb->lwb_write_zio);
-
-               lwb->lwb_root_zio = NULL;
-               lwb->lwb_write_zio = NULL;
-       } else {
-               ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-       }
-
+       VERIFY(list_is_empty(&lwb->lwb_waiters));
+       VERIFY(list_is_empty(&lwb->lwb_itxs));
         ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
-       ASSERT(list_is_empty(&lwb->lwb_itxs));
         ASSERT3P(lwb->lwb_write_zio, ==, NULL);
         ASSERT3P(lwb->lwb_root_zio, ==, NULL);
+       ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
+       ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
+           lwb->lwb_state == LWB_STATE_DONE);
  
         /*
          * Clear the zilog's field to indicate this lwb is no longer
@@ -708,7 +682,7 @@ zil_create(zilog_t *zilog)
  
         /*
          * If we just allocated the first log block, commit our transaction
-        * and wait for zil_sync() to stuff the block poiner into zh_log.
+        * and wait for zil_sync() to stuff the block pointer into zh_log.
          * (zh is part of the MOS, so we cannot modify it in open context.)
          */
         if (tx != NULL) {
@@ -940,6 +914,12 @@ zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
  static void
  zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
  {
+       /*
+        * The lwb_waiters field of the lwb is protected by the zilog's
+        * zl_lock, thus it must be held when calling this function.
+        */
+       ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
+
         mutex_enter(&zcw->zcw_lock);
         ASSERT(!list_link_active(&zcw->zcw_node));
         ASSERT3P(zcw->zcw_lwb, ==, NULL);
@@ -1319,7 +1299,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
          *   close.
          * - next find the maximum from the new suggested size and an array of
          *   previous sizes. This lessens a picket fence effect of wrongly
-        *   guesssing the size if we have a stream of say 2k, 64k, 2k, 64k
+        *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
          *   requests.
          *
          * Note we only write what is used, but we can't just allocate
@@ -1422,8 +1402,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
          * For more details, see the comment above zil_commit().
          */
         if (lrc->lrc_txtype == TX_COMMIT) {
+               mutex_enter(&zilog->zl_lock);
                 zil_commit_waiter_link_lwb(itx->itx_private, lwb);
                 itx->itx_private = NULL;
+               mutex_exit(&zilog->zl_lock);
                 return (lwb);
         }
  
@@ -2056,16 +2038,54 @@ zil_process_commit_list(zilog_t *zilog)
  
                 list_remove(&zilog->zl_itx_commit_list, itx);
  
-               /*
-                * This is inherently racy and may result in us writing
-                * out a log block for a txg that was just synced. This
-                * is ok since we'll end cleaning up that log block the
-                * next time we call zil_sync().
-                */
                 boolean_t synced = txg <= spa_last_synced_txg(spa);
                 boolean_t frozen = txg > spa_freeze_txg(spa);
  
-               if (!synced || frozen) {
+               /*
+                * If the txg of this itx has already been synced out, then
+                * we don't need to commit this itx to an lwb. This is
+                * because the data of this itx will have already been
+                * written to the main pool. This is inherently racy, and
+                * it's still ok to commit an itx whose txg has already
+                * been synced; this will result in a write that's
+                * unnecessary, but will do no harm.
+                *
+                * With that said, we always want to commit TX_COMMIT itxs
+                * to an lwb, regardless of whether or not that itx's txg
+                * has been synced out. We do this to ensure any OPENED lwb
+                * will always have at least one zil_commit_waiter_t linked
+                * to the lwb.
+                *
+                * As a counter-example, if we skipped TX_COMMIT itx's
+                * whose txg had already been synced, the following
+                * situation could occur if we happened to be racing with
+                * spa_sync:
+                *
+                * 1. We commit a non-TX_COMMIT itx to an lwb, where the
+                *    itx's txg is 10 and the last synced txg is 9.
+                * 2. spa_sync finishes syncing out txg 10.
+                * 3. We move to the next itx in the list, it's a TX_COMMIT
+                *    whose txg is 10, so we skip it rather than committing
+                *    it to the lwb used in (1).
+                *
+                * If the itx that is skipped in (3) is the last TX_COMMIT
+                * itx in the commit list, than it's possible for the lwb
+                * used in (1) to remain in the OPENED state indefinitely.
+                *
+                * To prevent the above scenario from occurring, ensuring
+                * that once an lwb is OPENED it will transition to ISSUED
+                * and eventually DONE, we always commit TX_COMMIT itx's to
+                * an lwb here, even if that itx's txg has already been
+                * synced.
+                *
+                * Finally, if the pool is frozen, we _always_ commit the
+                * itx.  The point of freezing the pool is to prevent data
+                * from being written to the main pool via spa_sync, and
+                * instead rely solely on the ZIL to persistently store the
+                * data; i.e.  when the pool is frozen, the last synced txg
+                * value can't be trusted.
+                */
+               if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
                         if (lwb != NULL) {
                                 lwb = zil_lwb_commit(zilog, itx, lwb);
  
@@ -2082,20 +2102,7 @@ zil_process_commit_list(zilog_t *zilog)
                                 list_insert_tail(&nolwb_itxs, itx);
                         }
                 } else {
-                       /*
-                        * If this is a commit itx, then there will be a
-                        * thread that is either: already waiting for
-                        * it, or soon will be waiting.
-                        *
-                        * This itx has already been committed to disk
-                        * via spa_sync() so we don't bother committing
-                        * it to an lwb. As a result, we cannot use the
-                        * lwb zio callback to signal the waiter and
-                        * mark it as done, so we must do that here.
-                        */
-                       if (lrc->lrc_txtype == TX_COMMIT)
-                               zil_commit_waiter_skip(itx->itx_private);
-
+                       ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
                         zil_itx_destroy(itx);
                 }
         }
@@ -2141,37 +2148,37 @@ zil_process_commit_list(zilog_t *zilog)
                  * variable is in one of the following states: "closed"
                  * or "open".
                  *
-                * If its "closed", then no itxs have been committed to
-                * it, so there's no point in issueing its zio (i.e.
-                * it's "empty").
+                * If it's "closed", then no itxs have been committed to
+                * it, so there's no point in issuing its zio (i.e. it's
+                * "empty").
                  *
-                * If its "open" state, then it contains one or more
-                * itxs that eventually need to be committed to stable
-                * storage. In this case we intentionally do not issue
-                * the lwb's zio to disk yet, and instead rely on one of
-                * the following two mechanisms for issuing the zio:
+                * If it's "open", then it contains one or more itxs that
+                * eventually need to be committed to stable storage. In
+                * this case we intentionally do not issue the lwb's zio
+                * to disk yet, and instead rely on one of the following
+                * two mechanisms for issuing the zio:
                  *
-                * 1. Ideally, there will be more ZIL activity occuring
+                * 1. Ideally, there will be more ZIL activity occurring
                  * on the system, such that this function will be
-                * immeidately called again (not necessarily by the same
+                * immediately called again (not necessarily by the same
                  * thread) and this lwb's zio will be issued via
                  * zil_lwb_commit(). This way, the lwb is guaranteed to
                  * be "full" when it is issued to disk, and we'll make
                  * use of the lwb's size the best we can.
                  *
-                * 2. If there isn't sufficient ZIL activity occuring on
+                * 2. If there isn't sufficient ZIL activity occurring on
                  * the system, such that this lwb's zio isn't issued via
                  * zil_lwb_commit(), zil_commit_waiter() will issue the
                  * lwb's zio. If this occurs, the lwb is not guaranteed
                  * to be "full" by the time its zio is issued, and means
                  * the size of the lwb was "too large" given the amount
-                * of ZIL activity occuring on the system at that time.
+                * of ZIL activity occurring on the system at that time.
                  *
                  * We do this for a couple of reasons:
                  *
                  * 1. To try and reduce the number of IOPs needed to
                  * write the same number of itxs. If an lwb has space
-                * available in it's buffer for more itxs, and more itxs
+                * available in its buffer for more itxs, and more itxs
                  * will be committed relatively soon (relative to the
                  * latency of performing a write), then it's beneficial
                  * to wait for these "next" itxs. This way, more itxs
@@ -2193,7 +2200,7 @@ zil_process_commit_list(zilog_t *zilog)
   * itxs will be processed. The assumption is the passed in waiter's
   * commit itx will found in the queue just like the other non-commit
   * itxs, such that when the entire queue is processed, the waiter will
- * have been commited to an lwb.
+ * have been committed to an lwb.
   *
   * The lwb associated with the passed in waiter is not guaranteed to
   * have been issued by the time this function completes. If the lwb is
@@ -2205,7 +2212,6 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
  {
         ASSERT(!MUTEX_HELD(&zilog->zl_lock));
         ASSERT(spa_writeable(zilog->zl_spa));
-       ASSERT0(zilog->zl_suspend);
  
         mutex_enter(&zilog->zl_issuer_lock);
  
@@ -2265,7 +2271,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
          * In order to call zil_lwb_write_issue() we must hold the
          * zilog's "zl_issuer_lock". We can't simply acquire that lock,
          * since we're already holding the commit waiter's "zcw_lock",
-        * and those two locks are aquired in the opposite order
+        * and those two locks are acquired in the opposite order
          * elsewhere.
          */
         mutex_exit(&zcw->zcw_lock);
@@ -2286,10 +2292,24 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
         ASSERT3P(lwb, ==, zcw->zcw_lwb);
  
         /*
-        * We've already checked this above, but since we hadn't
-        * acquired the zilog's zl_issuer_lock, we have to perform this
-        * check a second time while holding the lock. We can't call
+        * We've already checked this above, but since we hadn't acquired
+        * the zilog's zl_issuer_lock, we have to perform this check a
+        * second time while holding the lock.
+        *
+        * We don't need to hold the zl_lock since the lwb cannot transition
+        * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
+        * _can_ transition from ISSUED to DONE, but it's OK to race with
+        * that transition since we treat the lwb the same, whether it's in
+        * the ISSUED or DONE states.
+        *
+        * The important thing, is we treat the lwb differently depending on
+        * if it's ISSUED or OPENED, and block any other threads that might
+        * attempt to issue this lwb. For that reason we hold the
+        * zl_issuer_lock when checking the lwb_state; we must not call
          * zil_lwb_write_issue() if the lwb had already been issued.
+        *
+        * See the comment above the lwb_state_t structure definition for
+        * more details on the lwb states, and locking requirements.
          */
         if (lwb->lwb_state == LWB_STATE_ISSUED ||
             lwb->lwb_state == LWB_STATE_DONE)
@@ -2315,7 +2335,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
          * By having to issue the lwb's zio here, it means the size of the
          * lwb was too large, given the incoming throughput of itxs.  By
          * setting "zl_cur_used" to zero, we communicate this fact to the
-        * block size selection algorithm, so it can take this informaiton
+        * block size selection algorithm, so it can take this information
          * into account, and potentially select a smaller size for the
          * next lwb block that is allocated.
          */
@@ -2379,7 +2399,6 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
         ASSERT(!MUTEX_HELD(&zilog->zl_lock));
         ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
         ASSERT(spa_writeable(zilog->zl_spa));
-       ASSERT0(zilog->zl_suspend);
  
         mutex_enter(&zcw->zcw_lock);
  
@@ -2459,7 +2478,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
                          * complete.
                          *
                          * Additionally, if the lwb is NULL, the waiter
-                        * will soon be signalled and marked done via
+                        * will soon be signaled and marked done via
                          * zil_clean() and zil_itxg_clean(), so no timeout
                          * is required.
                          */
@@ -2560,7 +2579,7 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
   * When a thread calls zil_commit() a special "commit itx" will be
   * generated, along with a corresponding "waiter" for this commit itx.
   * zil_commit() will wait on this waiter's CV, such that when the waiter
- * is marked done, and signalled, zil_commit() will return.
+ * is marked done, and signaled, zil_commit() will return.
   *
   * This commit itx is inserted into the queue of uncommitted itxs. This
   * provides an easy mechanism for determining which itxs were in the
@@ -2570,7 +2589,7 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
   * The commit it is special; it doesn't have any on-disk representation.
   * When a commit itx is "committed" to an lwb, the waiter associated
   * with it is linked onto the lwb's list of waiters. Then, when that lwb
- * completes, each waiter on the lwb's list is marked done and signalled
+ * completes, each waiter on the lwb's list is marked done and signaled
   * -- allowing the thread waiting on the waiter to return from zil_commit().
   *
   * It's important to point out a few critical factors that allow us
@@ -2578,10 +2597,10 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
   * commit waiters, and zio completion callbacks like we're doing:
   *
   *   1. The list of waiters for each lwb is traversed, and each commit
- *      waiter is marked "done" and signalled, in the zio completion
+ *      waiter is marked "done" and signaled, in the zio completion
   *      callback of the lwb's zio[*].
   *
- *      * Actually, the waiters are signalled in the zio completion
+ *      * Actually, the waiters are signaled in the zio completion
   *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
   *        that are sent to the vdevs upon completion of the lwb zio.
   *
@@ -2629,7 +2648,7 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
   *             the zilog's "zl_issuer_lock", which ensures only a single
   *             thread is creating and/or issuing lwb's at a time
   *          2. The "previous" lwb is a child of the "current" lwb
- *             (leveraging the zio parent-child depenency graph)
+ *             (leveraging the zio parent-child dependency graph)
   *
   *      By relying on this parent-child zio relationship, we can have
   *      many lwb zio's concurrently issued to the underlying storage,
@@ -2662,7 +2681,7 @@ zil_commit(zilog_t *zilog, uint64_t foid)
                  * If the SPA is not writable, there should never be any
                  * pending itxs waiting to be committed to disk. If that
                  * weren't true, we'd skip writing those itxs out, and
-                * would break the sematics of zil_commit(); thus, we're
+                * would break the semantics of zil_commit(); thus, we're
                  * verifying that truth before we return to the caller.
                  */
                 ASSERT(list_is_empty(&zilog->zl_lwb_list));
@@ -2684,6 +2703,12 @@ zil_commit(zilog_t *zilog, uint64_t foid)
                 return;
         }
  
+       zil_commit_impl(zilog, foid);
+}
+
+void
+zil_commit_impl(zilog_t *zilog, uint64_t foid)
+{
         ZIL_STAT_BUMP(zil_commit_count);
  
         /*
@@ -3149,7 +3174,22 @@ zil_suspend(const char *osname, void **cookiep)
         zilog->zl_suspending = B_TRUE;
         mutex_exit(&zilog->zl_lock);
  
-       zil_commit(zilog, 0);
+       /*
+        * We need to use zil_commit_impl to ensure we wait for all
+        * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed
+        * to disk before proceeding. If we used zil_commit instead, it
+        * would just call txg_wait_synced(), because zl_suspend is set.
+        * txg_wait_synced() doesn't wait for these lwb's to be
+        * LWB_STATE_DONE before returning.
+        */
+       zil_commit_impl(zilog, 0);
+
+       /*
+        * Now that we've ensured all lwb's are LWB_STATE_DONE, we use
+        * txg_wait_synced() to ensure the data from the zilog has
+        * migrated to the main pool before calling zil_destroy().
+        */
+       txg_wait_synced(zilog->zl_dmu_pool, 0);
  
         zil_destroy(zilog, B_FALSE);
  
@@ -3400,6 +3440,9 @@ EXPORT_SYMBOL(zil_set_sync);
  EXPORT_SYMBOL(zil_set_logbias);
  
  /* BEGIN CSTYLED */
+module_param(zfs_commit_timeout_pct, int, 0644);
+MODULE_PARM_DESC(zfs_commit_timeout_pct, "ZIL block open timeout percentage");
+
  module_param(zil_replay_disable, int, 0644);
  MODULE_PARM_DESC(zil_replay_disable, "Disable intent logging replay");
  
diff --git a/module/zfs/zio.c b/module/zfs/zio.c

index c3b571e9a7e883d7ad8d5ff6375a47973814c9c5..7d237003423975fc7a2f17482d2a8cee8dfad57f 100644 (file)
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1987,20 +1987,6 @@ zio_reexecute(zio_t *pio)
         }
  }
  
-void
-zio_cancel(zio_t *zio)
-{
-       /*
-        * Disallow cancellation of a zio that's already been issued.
-        */
-       VERIFY3P(zio->io_executor, ==, NULL);
-
-       zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
-       zio->io_done = NULL;
-
-       zio_nowait(zio);
-}
-
  void
  zio_suspend(spa_t *spa, zio_t *zio)
  {
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run

index c4650ab41b0662a0cfc1bcc296850fa3b277b124..ad35e92be675a4ef4862e5b0541bbbc3d5133ac9 100644 (file)
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -656,7 +656,8 @@ tags = ['functional', 'scrub_mirror']
  tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
      'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg',
      'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg',
-    'slog_013_pos', 'slog_014_pos', 'slog_replay_fs', 'slog_replay_volume']
+    'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs',
+    'slog_replay_volume']
  tags = ['functional', 'slog']
  
  [tests/functional/snapshot]
diff --git a/tests/zfs-tests/tests/functional/slog/Makefile.am b/tests/zfs-tests/tests/functional/slog/Makefile.am

index f446facfe27b72993f797b1867e09dae2a32d8fe..157e6670713d808f9cf56011ae118efb9fc91e39 100644 (file)
--- a/tests/zfs-tests/tests/functional/slog/Makefile.am
+++ b/tests/zfs-tests/tests/functional/slog/Makefile.am
@@ -18,5 +18,6 @@ dist_pkgdata_SCRIPTS = \
         slog_012_neg.ksh \
         slog_013_pos.ksh \
         slog_014_pos.ksh \
+       slog_015_neg.ksh \
         slog_replay_fs.ksh \
         slog_replay_volume.ksh
diff --git a/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh b/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh

new file mode 100755 (executable)

index 0000000..3782188
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh
@@ -0,0 +1,73 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/slog/slog.kshlib
+
+#
+# DESCRIPTION:
+#      Concurrent sync writes with log offline/online works.
+#
+# STRATEGY:
+#      1. Configure "zfs_commit_timeout_pct"
+#      2. Create pool with a log device.
+#      3. Concurrently do the following:
+#         3.1. Perform 8K sync writes
+#         3.2. Perform log offline/online commands
+#      4. Loop to test with growing "zfs_commit_timout_pct" values.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       #
+       # Wait for any of the writes and/or zpool commands that were
+       # kicked off in the background to complete. On failure, we may
+       # enter this function without previously waiting for them.
+       #
+       wait
+
+       set_tunable64 zfs_commit_timeout_pct $ORIG_TIMEOUT
+
+       poolexists $TESTPOOL && zpool destroy -f $TESTPOOL
+}
+
+ORIG_TIMEOUT=$(get_tunable zfs_commit_timeout_pct | tail -1 | awk '{print $NF}')
+log_onexit cleanup
+
+for PCT in 0 1 2 4 8 16 32 64 128 256 512 1024; do
+       log_must set_tunable64 zfs_commit_timeout_pct $PCT
+
+       log_must zpool create $TESTPOOL $VDEV log $SDEV
+
+       for i in {1..10}; do
+               log_must fio --rw write --sync 1 --directory "/$TESTPOOL" \
+                   --bs 8K --size 8K --name slog-test
+       done &
+
+       for i in {1..10}; do
+               log_must zpool offline $TESTPOOL $SDEV
+               log_must verify_slog_device $TESTPOOL $SDEV 'OFFLINE'
+               log_must zpool online $TESTPOOL $SDEV
+               log_must verify_slog_device $TESTPOOL $SDEV 'ONLINE'
+       done &
+
+       wait
+
+       log_must zpool destroy -f $TESTPOOL
+done
+
+log_pass "Concurrent writes with slog offline/online works."
author	Prakash Surya <prakash.surya@delphix.com>
	Thu, 7 Dec 2017 19:26:32 +0000 (11:26 -0800)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Thu, 28 Dec 2017 18:18:04 +0000 (10:18 -0800)
include/sys/zil.h		patch \| blob \| blame \| history
include/sys/zil_impl.h		patch \| blob \| blame \| history
include/sys/zio.h		patch \| blob \| blame \| history
man/man5/zfs-module-parameters.5		patch \| blob \| blame \| history
module/zfs/zil.c		patch \| blob \| blame \| history
module/zfs/zio.c		patch \| blob \| blame \| history
tests/runfiles/linux.run		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/slog/Makefile.am		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh	[new file with mode: 0755]	patch \| blob