4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
28 #include <sys/spa_impl.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zfs_context.h>
33 #include <sys/callb.h>
36 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
37 * or opening a pool on more than one host at a time. In particular, it
38 * prevents "zpool import -f" on a host from succeeding while the pool is
39 * already imported on another host. There are many other ways in which a
40 * device could be used by two hosts for different purposes at the same time
41 * resulting in pool damage. This implementation does not attempt to detect
44 * MMP operates by ensuring there are frequent visible changes on disk (a
45 * "heartbeat") at all times. And by altering the import process to check
46 * for these changes and failing the import when they are detected. This
47 * functionality is enabled by setting the 'multihost' pool property to on.
49 * Uberblocks written by the txg_sync thread always go into the first
50 * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
51 * They are used to hold uberblocks which are exactly the same as the last
52 * synced uberblock except that the ub_timestamp is frequently updated.
53 * Like all other uberblocks, the slot is written with an embedded checksum,
54 * and slots with invalid checksums are ignored. This provides the
55 * "heartbeat", with no risk of overwriting good uberblocks that must be
56 * preserved, e.g. previous txgs and associated block pointers.
58 * Two optional fields are added to uberblock structure: ub_mmp_magic and
59 * ub_mmp_delay. The magic field allows zfs to tell whether ub_mmp_delay is
60 * valid. The delay field is a decaying average of the amount of time between
61 * completion of successive MMP writes, in nanoseconds. It is used to predict
62 * how long the import must wait to detect activity in the pool, before
63 * concluding it is not in use.
65 * During import an activity test may now be performed to determine if
66 * the pool is in use. The activity test is typically required if the
67 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
68 * POOL_STATE_ACTIVE, and the pool is not a root pool.
70 * The activity test finds the "best" uberblock (highest txg & timestamp),
71 * waits some time, and then finds the "best" uberblock again. If the txg
72 * and timestamp in both "best" uberblocks do not match, the pool is in use
73 * by another host and the import fails. Since the granularity of the
74 * timestamp is in seconds this activity test must take a bare minimum of one
75 * second. In order to assure the accuracy of the activity test, the default
76 * values result in an activity test duration of 10x the mmp write interval.
78 * The "zpool import" activity test can be expected to take a minimum time of
79 * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds. If the
80 * "best" uberblock has a valid ub_mmp_delay field, then the duration of the
81 * test may take longer if MMP writes were occurring less frequently than
82 * expected. Additionally, the duration is then extended by a random 25% to
 * attempt to detect simultaneous imports.  For example, if both partner
 * hosts are rebooted at the same time and automatically attempt to import the
 * pool.
89 * Used to control the frequency of mmp writes which are performed when the
90 * 'multihost' pool property is on. This is one factor used to determine the
91 * length of the activity check during import.
93 * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
94 * This means that on average an mmp write will be issued for each leaf vdev
95 * every zfs_multihost_interval milliseconds. In practice, the observed period
96 * can vary with the I/O load and this observed value is the delay which is
97 * stored in the uberblock. The minimum allowed value is 100 ms.
99 ulong_t zfs_multihost_interval
= MMP_DEFAULT_INTERVAL
;
102 * Used to control the duration of the activity test on import. Smaller values
103 * of zfs_multihost_import_intervals will reduce the import time but increase
104 * the risk of failing to detect an active pool. The total activity check time
105 * is never allowed to drop below one second. A value of 0 is ignored and
106 * treated as if it was set to 1.
108 uint_t zfs_multihost_import_intervals
= MMP_DEFAULT_IMPORT_INTERVALS
;
111 * Controls the behavior of the pool when mmp write failures are detected.
113 * When zfs_multihost_fail_intervals = 0 then mmp write failures are ignored.
114 * The failures will still be reported to the ZED which depending on its
115 * configuration may take action such as suspending the pool or taking a
118 * When zfs_multihost_fail_intervals > 0 then sequential mmp write failures will
119 * cause the pool to be suspended. This occurs when
120 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
121 * passed since the last successful mmp write. This guarantees the activity
122 * test will see mmp writes if the
125 uint_t zfs_multihost_fail_intervals
= MMP_DEFAULT_FAIL_INTERVALS
;
/* Tag used for the SCL_STATE config-lock hold taken around each MMP write. */
char *mmp_tag = "mmp_write_uberblock";

static void mmp_thread(void *arg);
133 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
135 mutex_init(&mmp
->mmp_thread_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
136 cv_init(&mmp
->mmp_thread_cv
, NULL
, CV_DEFAULT
, NULL
);
137 mutex_init(&mmp
->mmp_io_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
143 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
145 mutex_destroy(&mmp
->mmp_thread_lock
);
146 cv_destroy(&mmp
->mmp_thread_cv
);
147 mutex_destroy(&mmp
->mmp_io_lock
);
151 mmp_thread_enter(mmp_thread_t
*mmp
, callb_cpr_t
*cpr
)
153 CALLB_CPR_INIT(cpr
, &mmp
->mmp_thread_lock
, callb_generic_cpr
, FTAG
);
154 mutex_enter(&mmp
->mmp_thread_lock
);
158 mmp_thread_exit(mmp_thread_t
*mmp
, kthread_t
**mpp
, callb_cpr_t
*cpr
)
160 ASSERT(*mpp
!= NULL
);
162 cv_broadcast(&mmp
->mmp_thread_cv
);
163 CALLB_CPR_EXIT(cpr
); /* drops &mmp->mmp_thread_lock */
168 mmp_thread_start(spa_t
*spa
)
170 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
172 if (spa_writeable(spa
)) {
173 mutex_enter(&mmp
->mmp_thread_lock
);
174 if (!mmp
->mmp_thread
) {
175 dprintf("mmp_thread_start pool %s\n",
177 mmp
->mmp_thread
= thread_create(NULL
, 0, mmp_thread
,
178 spa
, 0, &p0
, TS_RUN
, defclsyspri
);
180 mutex_exit(&mmp
->mmp_thread_lock
);
185 mmp_thread_stop(spa_t
*spa
)
187 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
189 mutex_enter(&mmp
->mmp_thread_lock
);
190 mmp
->mmp_thread_exiting
= 1;
191 cv_broadcast(&mmp
->mmp_thread_cv
);
193 while (mmp
->mmp_thread
) {
194 cv_wait(&mmp
->mmp_thread_cv
, &mmp
->mmp_thread_lock
);
196 mutex_exit(&mmp
->mmp_thread_lock
);
198 ASSERT(mmp
->mmp_thread
== NULL
);
199 mmp
->mmp_thread_exiting
= 0;
203 * Choose a leaf vdev to write an MMP block to. It must not have an
204 * outstanding mmp write (if so then there is a problem, and a new write will
205 * also block). If there is no usable leaf in this subtree return NULL,
206 * otherwise return a pointer to the leaf.
208 * When walking the subtree, a random child is chosen as the starting point so
209 * that when the tree is healthy, the leaf chosen will be random with even
210 * distribution. If there are unhealthy vdevs in the tree, the distribution
211 * will be really poor only if a large proportion of the vdevs are unhealthy,
212 * in which case there are other more pressing problems.
215 mmp_random_leaf(vdev_t
*vd
)
219 if (!vdev_writeable(vd
))
222 if (vd
->vdev_ops
->vdev_op_leaf
)
223 return (vd
->vdev_mmp_pending
== 0 ? vd
: NULL
);
225 child_idx
= spa_get_random(vd
->vdev_children
);
226 for (int offset
= vd
->vdev_children
; offset
> 0; offset
--) {
228 vdev_t
*child
= vd
->vdev_child
[(child_idx
+ offset
) %
231 leaf
= mmp_random_leaf(child
);
240 mmp_write_done(zio_t
*zio
)
242 spa_t
*spa
= zio
->io_spa
;
243 vdev_t
*vd
= zio
->io_vd
;
244 mmp_thread_t
*mts
= zio
->io_private
;
246 mutex_enter(&mts
->mmp_io_lock
);
247 vd
->vdev_mmp_pending
= 0;
253 * Mmp writes are queued on a fixed schedule, but under many
254 * circumstances, such as a busy device or faulty hardware,
255 * the writes will complete at variable, much longer,
256 * intervals. In these cases, another node checking for
257 * activity must wait longer to account for these delays.
259 * The mmp_delay is calculated as a decaying average of the interval
260 * between completed mmp writes. This is used to predict how long
261 * the import must wait to detect activity in the pool, before
262 * concluding it is not in use.
264 * Do not set mmp_delay if the multihost property is not on,
265 * so as not to trigger an activity check on import.
267 if (spa_multihost(spa
)) {
268 hrtime_t delay
= gethrtime() - mts
->mmp_last_write
;
270 if (delay
> mts
->mmp_delay
)
271 mts
->mmp_delay
= delay
;
273 mts
->mmp_delay
= (delay
+ mts
->mmp_delay
* 127) /
278 mts
->mmp_last_write
= gethrtime();
281 mutex_exit(&mts
->mmp_io_lock
);
282 spa_config_exit(spa
, SCL_STATE
, mmp_tag
);
284 abd_free(zio
->io_abd
);
288 * When the uberblock on-disk is updated by a spa_sync,
289 * creating a new "best" uberblock, update the one stored
290 * in the mmp thread state, used for mmp writes.
293 mmp_update_uberblock(spa_t
*spa
, uberblock_t
*ub
)
295 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
297 mutex_enter(&mmp
->mmp_io_lock
);
299 mmp
->mmp_ub
.ub_timestamp
= gethrestime_sec();
300 mutex_exit(&mmp
->mmp_io_lock
);
304 * Choose a random vdev, label, and MMP block, and write over it
305 * with a copy of the last-synced uberblock, whose timestamp
306 * has been updated to reflect that the pool is in use.
309 mmp_write_uberblock(spa_t
*spa
)
311 int flags
= ZIO_FLAG_CONFIG_WRITER
| ZIO_FLAG_CANFAIL
;
312 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
318 spa_config_enter(spa
, SCL_STATE
, mmp_tag
, RW_READER
);
319 vd
= mmp_random_leaf(spa
->spa_root_vdev
);
321 spa_config_exit(spa
, SCL_STATE
, FTAG
);
325 mutex_enter(&mmp
->mmp_io_lock
);
327 if (mmp
->mmp_zio_root
== NULL
)
328 mmp
->mmp_zio_root
= zio_root(spa
, NULL
, NULL
,
329 flags
| ZIO_FLAG_GODFATHER
);
332 ub
->ub_timestamp
= gethrestime_sec();
333 ub
->ub_mmp_magic
= MMP_MAGIC
;
334 ub
->ub_mmp_delay
= mmp
->mmp_delay
;
335 vd
->vdev_mmp_pending
= gethrtime();
337 zio_t
*zio
= zio_null(mmp
->mmp_zio_root
, spa
, NULL
, NULL
, NULL
, flags
);
338 abd_t
*ub_abd
= abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd
), B_TRUE
);
339 abd_zero(ub_abd
, VDEV_UBERBLOCK_SIZE(vd
));
340 abd_copy_from_buf(ub_abd
, ub
, sizeof (uberblock_t
));
342 mutex_exit(&mmp
->mmp_io_lock
);
344 offset
= VDEV_UBERBLOCK_OFFSET(vd
, VDEV_UBERBLOCK_COUNT(vd
) -
345 MMP_BLOCKS_PER_LABEL
+ spa_get_random(MMP_BLOCKS_PER_LABEL
));
347 label
= spa_get_random(VDEV_LABELS
);
348 vdev_label_write(zio
, vd
, label
, ub_abd
, offset
,
349 VDEV_UBERBLOCK_SIZE(vd
), mmp_write_done
, mmp
,
350 flags
| ZIO_FLAG_DONT_PROPAGATE
);
352 spa_mmp_history_add(ub
->ub_txg
, ub
->ub_timestamp
, ub
->ub_mmp_delay
, vd
,
359 mmp_thread(void *arg
)
361 spa_t
*spa
= (spa_t
*)arg
;
362 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
363 boolean_t last_spa_suspended
= spa_suspended(spa
);
364 boolean_t last_spa_multihost
= spa_multihost(spa
);
366 hrtime_t max_fail_ns
= zfs_multihost_fail_intervals
*
367 MSEC2NSEC(MAX(zfs_multihost_interval
, MMP_MIN_INTERVAL
));
369 mmp_thread_enter(mmp
, &cpr
);
372 * The mmp_write_done() function calculates mmp_delay based on the
373 * prior value of mmp_delay and the elapsed time since the last write.
374 * For the first mmp write, there is no "last write", so we start
375 * with fake, but reasonable, default non-zero values.
377 mmp
->mmp_delay
= MSEC2NSEC(MAX(zfs_multihost_interval
,
378 MMP_MIN_INTERVAL
)) / MAX(vdev_count_leaves(spa
), 1);
379 mmp
->mmp_last_write
= gethrtime() - mmp
->mmp_delay
;
381 while (!mmp
->mmp_thread_exiting
) {
382 uint64_t mmp_fail_intervals
= zfs_multihost_fail_intervals
;
383 uint64_t mmp_interval
= MSEC2NSEC(
384 MAX(zfs_multihost_interval
, MMP_MIN_INTERVAL
));
385 boolean_t suspended
= spa_suspended(spa
);
386 boolean_t multihost
= spa_multihost(spa
);
387 hrtime_t start
, next_time
;
391 next_time
= start
+ mmp_interval
/
392 MAX(vdev_count_leaves(spa
), 1);
394 next_time
= start
+ MSEC2NSEC(MMP_DEFAULT_INTERVAL
);
398 * When MMP goes off => on, or spa goes suspended =>
399 * !suspended, we know no writes occurred recently. We
400 * update mmp_last_write to give us some time to try.
402 if ((!last_spa_multihost
&& multihost
) ||
403 (last_spa_suspended
&& !suspended
)) {
404 mutex_enter(&mmp
->mmp_io_lock
);
405 mmp
->mmp_last_write
= gethrtime();
406 mutex_exit(&mmp
->mmp_io_lock
);
407 } else if (last_spa_multihost
&& !multihost
) {
408 mutex_enter(&mmp
->mmp_io_lock
);
410 mutex_exit(&mmp
->mmp_io_lock
);
412 last_spa_multihost
= multihost
;
413 last_spa_suspended
= suspended
;
416 * Smooth max_fail_ns when its factors are decreased, because
417 * making (max_fail_ns < mmp_interval) results in the pool being
418 * immediately suspended before writes can occur at the new
421 if ((mmp_interval
* mmp_fail_intervals
) < max_fail_ns
) {
422 max_fail_ns
= ((31 * max_fail_ns
) + (mmp_interval
*
423 mmp_fail_intervals
)) / 32;
425 max_fail_ns
= mmp_interval
* mmp_fail_intervals
;
429 * Suspend the pool if no MMP write has succeeded in over
430 * mmp_interval * mmp_fail_intervals nanoseconds.
432 if (!suspended
&& mmp_fail_intervals
&& multihost
&&
433 (start
- mmp
->mmp_last_write
) > max_fail_ns
) {
434 cmn_err(CE_WARN
, "MMP writes to pool '%s' have not "
435 "succeeded in over %llus; suspending pool",
437 NSEC2SEC(start
- mmp
->mmp_last_write
));
438 zio_suspend(spa
, NULL
);
442 mmp_write_uberblock(spa
);
444 CALLB_CPR_SAFE_BEGIN(&cpr
);
445 (void) cv_timedwait_sig(&mmp
->mmp_thread_cv
,
446 &mmp
->mmp_thread_lock
, ddi_get_lbolt() +
447 ((next_time
- gethrtime()) / (NANOSEC
/ hz
)));
448 CALLB_CPR_SAFE_END(&cpr
, &mmp
->mmp_thread_lock
);
451 /* Outstanding writes are allowed to complete. */
452 if (mmp
->mmp_zio_root
)
453 zio_wait(mmp
->mmp_zio_root
);
455 mmp
->mmp_zio_root
= NULL
;
456 mmp_thread_exit(mmp
, &mmp
->mmp_thread
, &cpr
);
460 * Signal the MMP thread to wake it, when it is sleeping on
461 * its cv. Used when some module parameter has changed and
462 * we want the thread to know about it.
463 * Only signal if the pool is active and mmp thread is
464 * running, otherwise there is no thread to wake.
467 mmp_signal_thread(spa_t
*spa
)
469 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
471 mutex_enter(&mmp
->mmp_thread_lock
);
473 cv_broadcast(&mmp
->mmp_thread_cv
);
474 mutex_exit(&mmp
->mmp_thread_lock
);
478 mmp_signal_all_threads(void)
482 mutex_enter(&spa_namespace_lock
);
483 while ((spa
= spa_next(spa
))) {
484 if (spa
->spa_state
== POOL_STATE_ACTIVE
)
485 mmp_signal_thread(spa
);
487 mutex_exit(&spa_namespace_lock
);
490 #if defined(_KERNEL) && defined(HAVE_SPL)
491 #include <linux/mod_compat.h>
494 param_set_multihost_interval(const char *val
, zfs_kernel_param_t
*kp
)
498 ret
= param_set_ulong(val
, kp
);
502 mmp_signal_all_threads();
508 module_param(zfs_multihost_fail_intervals
, uint
, 0644);
509 MODULE_PARM_DESC(zfs_multihost_fail_intervals
,
510 "Max allowed period without a successful mmp write");
512 module_param_call(zfs_multihost_interval
, param_set_multihost_interval
,
513 param_get_ulong
, &zfs_multihost_interval
, 0644);
514 MODULE_PARM_DESC(zfs_multihost_interval
,
515 "Milliseconds between mmp writes to each leaf");
517 module_param(zfs_multihost_import_intervals
, uint
, 0644);
518 MODULE_PARM_DESC(zfs_multihost_import_intervals
,
519 "Number of zfs_multihost_interval periods to wait for activity");