module/zfs/mmp.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
  23  */
  24
  25 #include <sys/abd.h>
  26 #include <sys/mmp.h>
  27 #include <sys/spa.h>
  28 #include <sys/spa_impl.h>
  29 #include <sys/time.h>
  30 #include <sys/vdev.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/zfs_context.h>
  33 #include <sys/callb.h>
  34
  35 /*
  36  * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
  37  * or opening a pool on more than one host at a time.  In particular, it
  38  * prevents "zpool import -f" on a host from succeeding while the pool is
  39  * already imported on another host.  There are many other ways in which a
  40  * device could be used by two hosts for different purposes at the same time
  41  * resulting in pool damage.  This implementation does not attempt to detect
  42  * those cases.
  43  *
  44  * MMP operates by ensuring there are frequent visible changes on disk (a
  45  * "heartbeat") at all times, and by altering the import process to check
  46  * for these changes and fail the import when they are detected.  This
  47  * functionality is enabled by setting the 'multihost' pool property to on.
  48  *
  49  * Uberblocks written by the txg_sync thread always go into the first
  50  * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
  51  * They are used to hold uberblocks which are exactly the same as the last
  52  * synced uberblock except that the ub_timestamp is frequently updated.
  53  * Like all other uberblocks, the slot is written with an embedded checksum,
  54  * and slots with invalid checksums are ignored.  This provides the
  55  * "heartbeat", with no risk of overwriting good uberblocks that must be
  56  * preserved, e.g. previous txgs and associated block pointers.
  57  *
  58  * Two optional fields are added to uberblock structure: ub_mmp_magic and
  59  * ub_mmp_delay.  The magic field allows zfs to tell whether ub_mmp_delay is
  60  * valid.  The delay field is a decaying average of the amount of time between
  61  * completion of successive MMP writes, in nanoseconds.  It is used to predict
  62  * how long the import must wait to detect activity in the pool, before
  63  * concluding it is not in use.
  64  *
  65  * During import an activity test may now be performed to determine if
  66  * the pool is in use.  The activity test is typically required if the
  67  * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
  68  * POOL_STATE_ACTIVE, and the pool is not a root pool.
  69  *
  70  * The activity test finds the "best" uberblock (highest txg & timestamp),
  71  * waits some time, and then finds the "best" uberblock again.  If the txg
  72  * and timestamp in both "best" uberblocks do not match, the pool is in use
  73  * by another host and the import fails.  Since the granularity of the
  74  * timestamp is in seconds this activity test must take a bare minimum of one
  75  * second.  In order to assure the accuracy of the activity test, the default
  76  * values result in an activity test duration of 10x the mmp write interval.
  77  *
  78  * The "zpool import"  activity test can be expected to take a minimum time of
  79  * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds.  If the
  80  * "best" uberblock has a valid ub_mmp_delay field, then the duration of the
  81  * test may take longer if MMP writes were occurring less frequently than
  82  * expected.  Additionally, the duration is then extended by a random 25% to
  83  * attempt to detect simultaneous imports.  For example, if both partner
  84  * hosts are rebooted at the same time and automatically attempt to import the
  85  * pool.
  86  */
  87
  88 /*
  89  * Controls the frequency of mmp writes, which are performed while the
  90  * 'multihost' pool property is on.  This is one factor used to determine the
  91  * length of the activity check during import.
  92  *
  93  * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds,
  94  * so on average an mmp write is issued to each leaf vdev every
  95  * zfs_multihost_interval milliseconds.  In practice the observed period varies
  96  * with I/O load; that observed value is the delay stored in the uberblock.
  97  * The minimum allowed value is 100 ms (mmp_thread() applies MMP_MIN_INTERVAL).
  98  */
  99 ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
 100
 101 /*
 102  * Controls the duration of the activity test on import, expressed as a number
 103  * of zfs_multihost_interval periods.  Smaller values reduce the import time
 104  * but increase the risk of failing to detect an active pool.  The total
 105  * activity check time is never allowed to drop below one second.  A value of
 106  * 0 is ignored and treated as if it were set to 1.
 107  */
 108 uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
 109
 110 /*
 111  * Controls the behavior of the pool when mmp write failures are detected.
 112  *
 113  * When zfs_multihost_fail_intervals = 0, mmp write failures are ignored by
 114  * the mmp thread.  The failures will still be reported to the ZED, which
 115  * depending on its configuration may take action such as suspending the pool
 116  * or taking a device offline.
 117  *
 118  * When zfs_multihost_fail_intervals > 0, sequential mmp write failures will
 119  * cause the pool to be suspended.  This occurs when
 120  * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
 121  * passed since the last successful mmp write.  This guarantees the activity
 122  * test will see mmp writes if the pool is imported.  When the limit is
 123  * lowered, mmp_thread() applies the smaller limit gradually, not at once.
 124  */
 125 uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
 126
 127 static void mmp_thread(spa_t *spa);      /* MMP thread entry point */
 128 char *mmp_tag = "mmp_write_uberblock";   /* SCL_STATE hold tag */
 129
 130 void
 131 mmp_init(spa_t *spa)
 132 {
 133         mmp_thread_t *mmp = &spa->spa_mmp;
 134
 135         mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
 136         cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
 137         mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
 138         mmp->mmp_kstat_id = 1;
 139 }
 140
 141 void
 142 mmp_fini(spa_t *spa)
 143 {
 144         mmp_thread_t *mmp = &spa->spa_mmp;
 145
 146         mutex_destroy(&mmp->mmp_thread_lock);
 147         cv_destroy(&mmp->mmp_thread_cv);
 148         mutex_destroy(&mmp->mmp_io_lock);
 149 }
 150
 151 static void
 152 mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
 153 {
 154         CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
 155         mutex_enter(&mmp->mmp_thread_lock);
 156 }
 157
 158 static void
 159 mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
 160 {
 161         ASSERT(*mpp != NULL);
 162         *mpp = NULL;
 163         cv_broadcast(&mmp->mmp_thread_cv);
 164         CALLB_CPR_EXIT(cpr);            /* drops &mmp->mmp_thread_lock */
 165         thread_exit();
 166 }
 167
 168 void
 169 mmp_thread_start(spa_t *spa)
 170 {
 171         mmp_thread_t *mmp = &spa->spa_mmp;
 172
 173         if (spa_writeable(spa)) {
 174                 mutex_enter(&mmp->mmp_thread_lock);
 175                 if (!mmp->mmp_thread) {
 176                         dprintf("mmp_thread_start pool %s\n",
 177                             spa->spa_name);
 178                         mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
 179                             spa, 0, &p0, TS_RUN, defclsyspri);
 180                 }
 181                 mutex_exit(&mmp->mmp_thread_lock);
 182         }
 183 }
 184
 185 void
 186 mmp_thread_stop(spa_t *spa)
 187 {
 188         mmp_thread_t *mmp = &spa->spa_mmp;
 189
 190         mutex_enter(&mmp->mmp_thread_lock);
 191         mmp->mmp_thread_exiting = 1;
 192         cv_broadcast(&mmp->mmp_thread_cv);
 193
 194         while (mmp->mmp_thread) {
 195                 cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
 196         }
 197         mutex_exit(&mmp->mmp_thread_lock);
 198
 199         ASSERT(mmp->mmp_thread == NULL);
 200         mmp->mmp_thread_exiting = 0;
 201 }
 202
 203 /*
 204  * Choose a leaf vdev to write an MMP block to.  It must not have an
 205  * outstanding mmp write (if so then there is a problem, and a new write will
 206  * also block).  If there is no usable leaf in this subtree return NULL,
 207  * otherwise return a pointer to the leaf.
 208  *
 209  * When walking the subtree, a random child is chosen as the starting point so
 210  * that when the tree is healthy, the leaf chosen will be random with even
 211  * distribution.  If there are unhealthy vdevs in the tree, the distribution
 212  * will be really poor only if a large proportion of the vdevs are unhealthy,
 213  * in which case there are other more pressing problems.
 214  */
 215 static vdev_t *
 216 mmp_random_leaf(vdev_t *vd)
 217 {
 218         int child_idx;
 219
 220         if (!vdev_writeable(vd))
 221                 return (NULL);
 222
 223         if (vd->vdev_ops->vdev_op_leaf)
 224                 return (vd->vdev_mmp_pending == 0 ? vd : NULL);
 225
 226         child_idx = spa_get_random(vd->vdev_children);
 227         for (int offset = vd->vdev_children; offset > 0; offset--) {
 228                 vdev_t *leaf;
 229                 vdev_t *child = vd->vdev_child[(child_idx + offset) %
 230                     vd->vdev_children];
 231
 232                 leaf = mmp_random_leaf(child);
 233                 if (leaf)
 234                         return (leaf);
 235         }
 236
 237         return (NULL);
 238 }
 239
 240 static void
 241 mmp_write_done(zio_t *zio)
 242 {
 243         spa_t *spa = zio->io_spa;
 244         vdev_t *vd = zio->io_vd;
 245         mmp_thread_t *mts = zio->io_private;
 246
 247         mutex_enter(&mts->mmp_io_lock);
 248         uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
 249         hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
 250
 251         if (zio->io_error)
 252                 goto unlock;
 253
 254         /*
 255          * Mmp writes are queued on a fixed schedule, but under many
 256          * circumstances, such as a busy device or faulty hardware,
 257          * the writes will complete at variable, much longer,
 258          * intervals.  In these cases, another node checking for
 259          * activity must wait longer to account for these delays.
 260          *
 261          * The mmp_delay is calculated as a decaying average of the interval
 262          * between completed mmp writes.  This is used to predict how long
 263          * the import must wait to detect activity in the pool, before
 264          * concluding it is not in use.
 265          *
 266          * Do not set mmp_delay if the multihost property is not on,
 267          * so as not to trigger an activity check on import.
 268          */
 269         if (spa_multihost(spa)) {
 270                 hrtime_t delay = gethrtime() - mts->mmp_last_write;
 271
 272                 if (delay > mts->mmp_delay)
 273                         mts->mmp_delay = delay;
 274                 else
 275                         mts->mmp_delay = (delay + mts->mmp_delay * 127) /
 276                             128;
 277         } else {
 278                 mts->mmp_delay = 0;
 279         }
 280         mts->mmp_last_write = gethrtime();
 281
 282 unlock:
 283         vd->vdev_mmp_pending = 0;
 284         vd->vdev_mmp_kstat_id = 0;
 285
 286         mutex_exit(&mts->mmp_io_lock);
 287         spa_config_exit(spa, SCL_STATE, mmp_tag);
 288
 289         spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error,
 290             mmp_write_duration);
 291
 292         abd_free(zio->io_abd);
 293 }
 294
 295 /*
 296  * When the uberblock on-disk is updated by a spa_sync,
 297  * creating a new "best" uberblock, update the one stored
 298  * in the mmp thread state, used for mmp writes.
 299  */
 300 void
 301 mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
 302 {
 303         mmp_thread_t *mmp = &spa->spa_mmp;
 304
 305         mutex_enter(&mmp->mmp_io_lock);
 306         mmp->mmp_ub = *ub;
 307         mmp->mmp_ub.ub_timestamp = gethrestime_sec();
 308         mutex_exit(&mmp->mmp_io_lock);
 309 }
 310
 311 /*
 312  * Choose a random vdev, label, and MMP block, and write over it
 313  * with a copy of the last-synced uberblock, whose timestamp
 314  * has been updated to reflect that the pool is in use.
 315  */
 316 static void
 317 mmp_write_uberblock(spa_t *spa)
 318 {
 319         int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 320         mmp_thread_t *mmp = &spa->spa_mmp;
 321         uberblock_t *ub;
 322         vdev_t *vd;
 323         int label;
 324         uint64_t offset;
 325
 326         hrtime_t lock_acquire_time = gethrtime();
 327         spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
 328         lock_acquire_time = gethrtime() - lock_acquire_time;
 329         if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
 330                 zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
 331                     (u_longlong_t)lock_acquire_time);
 332
 333         vd = mmp_random_leaf(spa->spa_root_vdev);
 334         if (vd == NULL) {
 335                 spa_config_exit(spa, SCL_STATE, FTAG);
 336                 return;
 337         }
 338
 339         mutex_enter(&mmp->mmp_io_lock);
 340
 341         if (mmp->mmp_zio_root == NULL)
 342                 mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
 343                     flags | ZIO_FLAG_GODFATHER);
 344
 345         ub = &mmp->mmp_ub;
 346         ub->ub_timestamp = gethrestime_sec();
 347         ub->ub_mmp_magic = MMP_MAGIC;
 348         ub->ub_mmp_delay = mmp->mmp_delay;
 349         vd->vdev_mmp_pending = gethrtime();
 350         vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id++;
 351
 352         zio_t *zio  = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
 353         abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
 354         abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
 355         abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
 356
 357         mutex_exit(&mmp->mmp_io_lock);
 358
 359         offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
 360             MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));
 361
 362         label = spa_get_random(VDEV_LABELS);
 363         vdev_label_write(zio, vd, label, ub_abd, offset,
 364             VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
 365             flags | ZIO_FLAG_DONT_PROPAGATE);
 366
 367         spa_mmp_history_add(ub->ub_txg, ub->ub_timestamp, ub->ub_mmp_delay, vd,
 368             label, vd->vdev_mmp_kstat_id);
 369
 370         zio_nowait(zio);
 371 }
 372
 373 static void
 374 mmp_thread(spa_t *spa)
 375 {
 376         mmp_thread_t *mmp = &spa->spa_mmp;
 377         boolean_t last_spa_suspended = spa_suspended(spa);
 378         boolean_t last_spa_multihost = spa_multihost(spa);
 379         callb_cpr_t cpr;
 380         hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
 381             MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
 382
 383         mmp_thread_enter(mmp, &cpr);
 384
 385         /*
 386          * The mmp_write_done() function calculates mmp_delay based on the
 387          * prior value of mmp_delay and the elapsed time since the last write.
 388          * For the first mmp write, there is no "last write", so we start
 389          * with fake, but reasonable, default non-zero values.
 390          */
 391         mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
 392             MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
 393         mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;
 394
 395         while (!mmp->mmp_thread_exiting) {
 396                 uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
 397                 uint64_t mmp_interval = MSEC2NSEC(
 398                     MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
 399                 boolean_t suspended = spa_suspended(spa);
 400                 boolean_t multihost = spa_multihost(spa);
 401                 hrtime_t start, next_time;
 402
 403                 start = gethrtime();
 404                 if (multihost) {
 405                         next_time = start + mmp_interval /
 406                             MAX(vdev_count_leaves(spa), 1);
 407                 } else {
 408                         next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
 409                 }
 410
 411                 /*
 412                  * When MMP goes off => on, or spa goes suspended =>
 413                  * !suspended, we know no writes occurred recently.  We
 414                  * update mmp_last_write to give us some time to try.
 415                  */
 416                 if ((!last_spa_multihost && multihost) ||
 417                     (last_spa_suspended && !suspended)) {
 418                         mutex_enter(&mmp->mmp_io_lock);
 419                         mmp->mmp_last_write = gethrtime();
 420                         mutex_exit(&mmp->mmp_io_lock);
 421                 } else if (last_spa_multihost && !multihost) {
 422                         mutex_enter(&mmp->mmp_io_lock);
 423                         mmp->mmp_delay = 0;
 424                         mutex_exit(&mmp->mmp_io_lock);
 425                 }
 426                 last_spa_multihost = multihost;
 427                 last_spa_suspended = suspended;
 428
 429                 /*
 430                  * Smooth max_fail_ns when its factors are decreased, because
 431                  * making (max_fail_ns < mmp_interval) results in the pool being
 432                  * immediately suspended before writes can occur at the new
 433                  * higher frequency.
 434                  */
 435                 if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
 436                         max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
 437                             mmp_fail_intervals)) / 32;
 438                 } else {
 439                         max_fail_ns = mmp_interval * mmp_fail_intervals;
 440                 }
 441
 442                 /*
 443                  * Suspend the pool if no MMP write has succeeded in over
 444                  * mmp_interval * mmp_fail_intervals nanoseconds.
 445                  */
 446                 if (!suspended && mmp_fail_intervals && multihost &&
 447                     (start - mmp->mmp_last_write) > max_fail_ns) {
 448                         cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
 449                             "succeeded in over %llus; suspending pool",
 450                             spa_name(spa),
 451                             NSEC2SEC(start - mmp->mmp_last_write));
 452                         zio_suspend(spa, NULL);
 453                 }
 454
 455                 if (multihost && !suspended)
 456                         mmp_write_uberblock(spa);
 457
 458                 CALLB_CPR_SAFE_BEGIN(&cpr);
 459                 (void) cv_timedwait_sig(&mmp->mmp_thread_cv,
 460                     &mmp->mmp_thread_lock, ddi_get_lbolt() +
 461                     ((next_time - gethrtime()) / (NANOSEC / hz)));
 462                 CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
 463         }
 464
 465         /* Outstanding writes are allowed to complete. */
 466         if (mmp->mmp_zio_root)
 467                 zio_wait(mmp->mmp_zio_root);
 468
 469         mmp->mmp_zio_root = NULL;
 470         mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
 471 }
 472
 473 /*
 474  * Signal the MMP thread to wake it, when it is sleeping on
 475  * its cv.  Used when some module parameter has changed and
 476  * we want the thread to know about it.
 477  * Only signal if the pool is active and mmp thread is
 478  * running, otherwise there is no thread to wake.
 479  */
 480 static void
 481 mmp_signal_thread(spa_t *spa)
 482 {
 483         mmp_thread_t *mmp = &spa->spa_mmp;
 484
 485         mutex_enter(&mmp->mmp_thread_lock);
 486         if (mmp->mmp_thread)
 487                 cv_broadcast(&mmp->mmp_thread_cv);
 488         mutex_exit(&mmp->mmp_thread_lock);
 489 }
 490
 491 void
 492 mmp_signal_all_threads(void)
 493 {
 494         spa_t *spa = NULL;
 495
 496         mutex_enter(&spa_namespace_lock);
 497         while ((spa = spa_next(spa))) {
 498                 if (spa->spa_state == POOL_STATE_ACTIVE)
 499                         mmp_signal_thread(spa);
 500         }
 501         mutex_exit(&spa_namespace_lock);
 502 }
 503
 504 #if defined(_KERNEL) && defined(HAVE_SPL)
 505 #include <linux/mod_compat.h>
 506
 507 static int
 508 param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
 509 {
 510         int ret;
 511
 512         ret = param_set_ulong(val, kp);
 513         if (ret < 0)
 514                 return (ret);
 515
 516         mmp_signal_all_threads();
 517
 518         return (ret);
 519 }
 520
 521 /* BEGIN CSTYLED */
 522 module_param(zfs_multihost_fail_intervals, uint, 0644);
 523 MODULE_PARM_DESC(zfs_multihost_fail_intervals,
 524         "Max allowed period without a successful mmp write");
 525
 526 module_param_call(zfs_multihost_interval, param_set_multihost_interval,
 527     param_get_ulong, &zfs_multihost_interval, 0644);
 528 MODULE_PARM_DESC(zfs_multihost_interval,
 529         "Milliseconds between mmp writes to each leaf");
 530
 531 module_param(zfs_multihost_import_intervals, uint, 0644);
 532 MODULE_PARM_DESC(zfs_multihost_import_intervals,
 533         "Number of zfs_multihost_interval periods to wait for activity");
 534 /* END CSTYLED */
 535 #endif