/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
 */

#include <sys/abd.h>
#include <sys/mmp.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/time.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_context.h>
#include <sys/callb.h>

/*
 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
 * or opening a pool on more than one host at a time.  In particular, it
 * prevents "zpool import -f" on a host from succeeding while the pool is
 * already imported on another host.  There are many other ways in which a
 * device could be used by two hosts for different purposes at the same time
 * resulting in pool damage.  This implementation does not attempt to detect
 * those cases.
 *
 * MMP operates by ensuring there are frequent visible changes on disk (a
 * "heartbeat") at all times, and by altering the import process to check
 * for these changes and fail the import when they are detected.  This
 * functionality is enabled by setting the 'multihost' pool property to on.
 *
 * Uberblocks written by the txg_sync thread always go into the first
 * (N - MMP_BLOCKS_PER_LABEL) slots; the remaining slots are reserved for MMP.
 * They are used to hold uberblocks which are exactly the same as the last
 * synced uberblock except that the ub_timestamp is frequently updated.
 * Like all other uberblocks, the slot is written with an embedded checksum,
 * and slots with invalid checksums are ignored.  This provides the
 * "heartbeat", with no risk of overwriting good uberblocks that must be
 * preserved, e.g. previous txgs and associated block pointers.
 *
 * Two optional fields are added to the uberblock structure: ub_mmp_magic and
 * ub_mmp_delay.  The magic field allows zfs to tell whether ub_mmp_delay is
 * valid.  The delay field is a decaying average of the amount of time between
 * completion of successive MMP writes, in nanoseconds.  It is used to predict
 * how long the import must wait to detect activity in the pool, before
 * concluding it is not in use.
 *
 * During import an activity test may now be performed to determine if
 * the pool is in use.  The activity test is typically required if the
 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
 * POOL_STATE_ACTIVE, and the pool is not a root pool.
 *
 * The activity test finds the "best" uberblock (highest txg & timestamp),
 * waits some time, and then finds the "best" uberblock again.  If the txg
 * and timestamp in both "best" uberblocks do not match, the pool is in use
 * by another host and the import fails.  Since the granularity of the
 * timestamp is in seconds this activity test must take a bare minimum of one
 * second.  In order to assure the accuracy of the activity test, the default
 * values result in an activity test duration of 10x the mmp write interval.
 *
 * The "zpool import" activity test can be expected to take a minimum time of
 * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds.  If
 * the "best" uberblock has a valid ub_mmp_delay field, then the duration of
 * the test may take longer if MMP writes were occurring less frequently than
 * expected.  Additionally, the duration is then extended by a random 25% to
 * attempt to detect simultaneous imports.  For example, if both partner
 * hosts are rebooted at the same time and automatically attempt to import
 * the pool, the random extension makes it unlikely that both will pass the
 * activity test at the same moment.
 */
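
/*
 * Worked example (illustrative arithmetic only, assuming the defaults
 * zfs_multihost_interval = 1000 ms and zfs_multihost_import_intervals = 10):
 * the activity test lasts at least 10 * 1000 ms = 10 s, extended by a random
 * factor of up to 25%, i.e. somewhere between 10 and 12.5 seconds before
 * "zpool import -f" can conclude the pool is unused.  A larger ub_mmp_delay
 * observed on disk lengthens the wait further.
 */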

/*
 * Used to control the frequency of mmp writes which are performed when the
 * 'multihost' pool property is on.  This is one factor used to determine the
 * length of the activity check during import.
 *
 * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
 * This means that on average an mmp write will be issued for each leaf vdev
 * every zfs_multihost_interval milliseconds.  In practice, the observed period
 * can vary with the I/O load and this observed value is the delay which is
 * stored in the uberblock.  The minimum allowed value is 100 ms.
 */
ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
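
/*
 * Illustrative example (assumed pool shape): with zfs_multihost_interval =
 * 1000 ms and 8 leaf vdevs, one mmp write is issued roughly every 125 ms,
 * rotating across the leaves, so each individual leaf still sees about one
 * write per second.
 */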

/*
 * Used to control the duration of the activity test on import.  Smaller values
 * of zfs_multihost_import_intervals will reduce the import time but increase
 * the risk of failing to detect an active pool.  The total activity check time
 * is never allowed to drop below one second.  A value of 0 is ignored and
 * treated as if it was set to 1.
 */
uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;

/*
 * Controls the behavior of the pool when mmp write failures are detected.
 *
 * When zfs_multihost_fail_intervals = 0 then mmp write failures are ignored.
 * The failures will still be reported to the ZED which depending on its
 * configuration may take action such as suspending the pool or taking a
 * device offline.
 *
 * When zfs_multihost_fail_intervals > 0 then sequential mmp write failures
 * will cause the pool to be suspended.  This occurs when
 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
 * passed since the last successful mmp write.  This guarantees the activity
 * test will see mmp writes if the pool is imported.
 */
uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
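
/*
 * Worked example (assumed values): with zfs_multihost_fail_intervals = 5 and
 * zfs_multihost_interval = 1000 ms, the pool is suspended once 5 seconds pass
 * without a successful mmp write.  An importing host that waits the activity
 * test duration described above will therefore observe either a fresh
 * heartbeat or a suspended, no-longer-writing pool.
 */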

static void mmp_thread(spa_t *spa);

char *mmp_tag = "mmp_write_uberblock";

void
mmp_init(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);

	mmp->mmp_kstat_id = 1;
}

void
mmp_fini(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_destroy(&mmp->mmp_thread_lock);
	cv_destroy(&mmp->mmp_thread_cv);
	mutex_destroy(&mmp->mmp_io_lock);
}

static void
mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
	mutex_enter(&mmp->mmp_thread_lock);
}

static void
mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
{
	ASSERT(*mpp != NULL);
	*mpp = NULL;
	cv_broadcast(&mmp->mmp_thread_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &mmp->mmp_thread_lock */
	thread_exit();
}

void
mmp_thread_start(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	if (spa_writeable(spa)) {
		mutex_enter(&mmp->mmp_thread_lock);
		if (!mmp->mmp_thread) {
			dprintf("mmp_thread_start pool %s\n",
			    spa->spa_name);
			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
			    spa, 0, &p0, TS_RUN, defclsyspri);
		}
		mutex_exit(&mmp->mmp_thread_lock);
	}
}

void
mmp_thread_stop(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	mmp->mmp_thread_exiting = 1;
	cv_broadcast(&mmp->mmp_thread_cv);

	while (mmp->mmp_thread) {
		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
	}
	mutex_exit(&mmp->mmp_thread_lock);

	ASSERT(mmp->mmp_thread == NULL);
	mmp->mmp_thread_exiting = 0;
}
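
/*
 * Usage note (an assumption based on the surrounding design, not code in
 * this file): the start/stop pair above is expected to bracket the writable
 * lifetime of the pool, e.g. started from the import/load path and stopped
 * before export or unload, so a heartbeat is only written while the pool can
 * change on disk.
 */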

typedef enum mmp_vdev_state_flag {
	MMP_FAIL_NOT_WRITABLE	= (1 << 0),
	MMP_FAIL_WRITE_PENDING	= (1 << 1),
} mmp_vdev_state_flag_t;

static vdev_t *
mmp_random_leaf_impl(vdev_t *vd, int *fail_mask)
{
	int child_idx;

	if (!vdev_writeable(vd)) {
		*fail_mask |= MMP_FAIL_NOT_WRITABLE;
		return (NULL);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_t *ret;

		if (vd->vdev_mmp_pending != 0) {
			*fail_mask |= MMP_FAIL_WRITE_PENDING;
			ret = NULL;
		} else {
			ret = vd;
		}

		return (ret);
	}

	child_idx = spa_get_random(vd->vdev_children);
	for (int offset = vd->vdev_children; offset > 0; offset--) {
		vdev_t *leaf;
		vdev_t *child = vd->vdev_child[(child_idx + offset) %
		    vd->vdev_children];

		leaf = mmp_random_leaf_impl(child, fail_mask);
		if (leaf)
			return (leaf);
	}

	return (NULL);
}

/*
 * Find a leaf vdev to write an MMP block to.  It must not have an outstanding
 * mmp write (if so a new write will also likely block).  If there is no usable
 * leaf in the tree rooted at in_vd, a nonzero error value is returned, and
 * *out_vd is unchanged.
 *
 * The error value returned is a bit field.
 *
 * MMP_FAIL_WRITE_PENDING
 * If set, one or more leaf vdevs are writeable, but have an MMP write which
 * has not yet completed.
 *
 * MMP_FAIL_NOT_WRITABLE
 * If set, one or more vdevs are not writeable.  The children of those vdevs
 * were not examined.
 *
 * Assuming in_vd points to a tree, a random subtree will be chosen to start.
 * That subtree, and successive ones, will be walked until a usable leaf has
 * been found, or all subtrees have been examined (except that the children of
 * un-writeable vdevs are not examined).
 *
 * If the leaf vdevs in the tree are healthy, the distribution of returned leaf
 * vdevs will be even.  If there are unhealthy leaves, the following leaves
 * (child_index % index_children) will be chosen more often.
 */
static int
mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd)
{
	int error_mask = 0;
	vdev_t *vd = mmp_random_leaf_impl(in_vd, &error_mask);

	if (vd) {
		*out_vd = vd;
		return (0);
	}

	return (error_mask);
}
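
/*
 * Illustrative walk order for the child scan in mmp_random_leaf_impl()
 * (example values): with 4 children and a randomly drawn child_idx of 2,
 * the loop visits child indices (2+4)%4 = 2, (2+3)%4 = 1, (2+2)%4 = 0,
 * (2+1)%4 = 3.  Every child is visited exactly once, starting from a random
 * position, which is what spreads MMP writes evenly over healthy leaves.
 */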

/*
 * MMP writes are issued on a fixed schedule, but may complete at variable,
 * much longer, intervals.  The mmp_delay captures long periods between
 * successful writes for any reason, including disk latency, scheduling
 * delays, etc.
 *
 * The mmp_delay is usually calculated as a decaying average, but if the latest
 * delay is higher we do not average it, so that we do not hide sudden spikes
 * which the importing host must wait for.
 *
 * If writes are occurring frequently, such as due to a high rate of txg syncs,
 * the mmp_delay could become very small.  Since those short delays depend on
 * activity we cannot count on, we never allow mmp_delay to get lower than the
 * rate expected if only mmp_thread writes occur.
 *
 * If an mmp write was skipped or fails, and we have already waited longer than
 * mmp_delay, we need to update it so the next write reflects the longer delay.
 *
 * Do not set mmp_delay if the multihost property is not on, so as not to
 * trigger an activity check on import.
 */
static void
mmp_delay_update(spa_t *spa, boolean_t write_completed)
{
	mmp_thread_t *mts = &spa->spa_mmp;
	hrtime_t delay = gethrtime() - mts->mmp_last_write;

	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));

	if (spa_multihost(spa) == B_FALSE) {
		mts->mmp_delay = 0;
		return;
	}

	if (delay > mts->mmp_delay)
		mts->mmp_delay = delay;

	if (write_completed == B_FALSE)
		return;

	mts->mmp_last_write = gethrtime();

	/*
	 * strictly less than, in case delay was changed above.
	 */
	if (delay < mts->mmp_delay) {
		hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
		    vdev_count_leaves(spa);
		mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
		    min_delay);
	}
}
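
/*
 * Worked example of the decaying average above (illustrative numbers): with
 * a stored mmp_delay of 128,000,000 ns and a new measured delay of
 * 1,000,000 ns, the update yields (1000000 + 128000000 * 127) / 128, which
 * is about 127,007,812 ns, a reduction of roughly 0.8%.  Short delays pull
 * the average down slowly over many writes, while a single long delay
 * replaces it immediately via the spike check earlier in this function.
 */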

static void
mmp_write_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	mmp_thread_t *mts = zio->io_private;

	mutex_enter(&mts->mmp_io_lock);
	uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
	hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;

	mmp_delay_update(spa, (zio->io_error == 0));

	vd->vdev_mmp_pending = 0;
	vd->vdev_mmp_kstat_id = 0;

	mutex_exit(&mts->mmp_io_lock);
	spa_config_exit(spa, SCL_STATE, mmp_tag);

	spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error,
	    mmp_write_duration);

	abd_free(zio->io_abd);
}

/*
 * When the uberblock on-disk is updated by a spa_sync, creating a new
 * "best" uberblock, update the one stored in the mmp thread state, used for
 * mmp writes.
 */
void
mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_io_lock);
	mmp->mmp_ub = *ub;
	mmp->mmp_ub.ub_timestamp = gethrestime_sec();
	mmp_delay_update(spa, B_TRUE);
	mutex_exit(&mmp->mmp_io_lock);
}
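
/*
 * Usage note (an assumption based on the comment above rather than code in
 * this file): the caller is expected to be the uberblock sync path, so that
 * each txg sync refreshes the copy the mmp thread re-stamps and rewrites
 * between syncs.  Holding mmp_io_lock while copying keeps a concurrent
 * mmp_write_uberblock() from reading a half-updated uberblock.
 */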

/*
 * Choose a random vdev, label, and MMP block, and write over it
 * with a copy of the last-synced uberblock, whose timestamp
 * has been updated to reflect that the pool is in use.
 */
static void
mmp_write_uberblock(spa_t *spa)
{
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	mmp_thread_t *mmp = &spa->spa_mmp;
	uberblock_t *ub;
	vdev_t *vd = NULL;
	int label, error;
	uint64_t offset;

	hrtime_t lock_acquire_time = gethrtime();
	spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
	lock_acquire_time = gethrtime() - lock_acquire_time;
	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
		zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
		    (u_longlong_t)lock_acquire_time);

	error = mmp_random_leaf(spa->spa_root_vdev, &vd);

	mutex_enter(&mmp->mmp_io_lock);

	/*
	 * spa_mmp_history has two types of entries:
	 * Issued MMP write: records time issued, error status, etc.
	 * Skipped MMP write: an MMP write could not be issued because no
	 * suitable leaf vdev was available.  See comment above struct
	 * spa_mmp_history for details.
	 */

	if (error) {
		mmp_delay_update(spa, B_FALSE);
		if (mmp->mmp_skip_error == error) {
			spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
		} else {
			mmp->mmp_skip_error = error;
			spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
			    gethrestime_sec(), mmp->mmp_delay, NULL, 0,
			    mmp->mmp_kstat_id++, error);
		}
		mutex_exit(&mmp->mmp_io_lock);
		spa_config_exit(spa, SCL_STATE, FTAG);
		return;
	}

	mmp->mmp_skip_error = 0;

	if (mmp->mmp_zio_root == NULL)
		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
		    flags | ZIO_FLAG_GODFATHER);

	ub = &mmp->mmp_ub;
	ub->ub_timestamp = gethrestime_sec();
	ub->ub_mmp_magic = MMP_MAGIC;
	ub->ub_mmp_delay = mmp->mmp_delay;
	vd->vdev_mmp_pending = gethrtime();
	vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;

	zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

	mmp->mmp_kstat_id++;
	mutex_exit(&mmp->mmp_io_lock);

	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
	    MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));

	label = spa_get_random(VDEV_LABELS);
	vdev_label_write(zio, vd, label, ub_abd, offset,
	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
	    flags | ZIO_FLAG_DONT_PROPAGATE);

	(void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
	    ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);

	zio_nowait(zio);
}

static void
mmp_thread(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t last_spa_suspended = spa_suspended(spa);
	boolean_t last_spa_multihost = spa_multihost(spa);
	callb_cpr_t cpr;
	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));

	mmp_thread_enter(mmp, &cpr);

	/*
	 * The mmp_write_done() function calculates mmp_delay based on the
	 * prior value of mmp_delay and the elapsed time since the last write.
	 * For the first mmp write, there is no "last write", so we start
	 * with fake, but reasonable, default non-zero values.
	 */
	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;

	while (!mmp->mmp_thread_exiting) {
		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
		uint64_t mmp_interval = MSEC2NSEC(
		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
		boolean_t suspended = spa_suspended(spa);
		boolean_t multihost = spa_multihost(spa);
		hrtime_t next_time;

		if (multihost)
			next_time = gethrtime() + mmp_interval /
			    MAX(vdev_count_leaves(spa), 1);
		else
			next_time = gethrtime() +
			    MSEC2NSEC(MMP_DEFAULT_INTERVAL);

		/*
		 * MMP off => on, or suspended => !suspended:
		 * No writes occurred recently.  Update mmp_last_write to give
		 * us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mutex_exit(&mmp->mmp_io_lock);
		}

		/*
		 * MMP on => off:
		 * mmp_delay == 0 tells importing node to skip activity check.
		 */
		if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}
		last_spa_multihost = multihost;
		last_spa_suspended = suspended;

		/*
		 * Smooth max_fail_ns when its factors are decreased, because
		 * making (max_fail_ns < mmp_interval) results in the pool
		 * being immediately suspended before writes can occur at the
		 * new higher frequency.
		 */
		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
			max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
			    mmp_fail_intervals)) / 32;
		} else {
			max_fail_ns = mmp_interval * mmp_fail_intervals;
		}

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (!suspended && mmp_fail_intervals && multihost &&
		    (gethrtime() - mmp->mmp_last_write) > max_fail_ns) {
			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
			    "succeeded in over %llus; suspending pool",
			    spa_name(spa),
			    NSEC2SEC(gethrtime() - mmp->mmp_last_write));
			zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
		}

		if (multihost && !suspended)
			mmp_write_uberblock(spa);

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, next_time, USEC2NSEC(1),
		    CALLOUT_FLAG_ABSOLUTE);
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	if (mmp->mmp_zio_root)
		zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}
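
/*
 * Worked example of the max_fail_ns smoothing above (illustrative numbers):
 * if max_fail_ns is 10 s and the operator lowers the tunables so that
 * mmp_interval * mmp_fail_intervals becomes 1 s, one loop iteration yields
 * (31 * 10 + 1) / 32, about 9.72 s.  The limit glides down by roughly 3% per
 * iteration instead of dropping instantly below the time since the last
 * successful write, which would suspend the pool spuriously.
 */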

/*
 * Signal the MMP thread to wake it, when it is sleeping on
 * its cv.  Used when some module parameter has changed and
 * we want the thread to know about it.
 * Only signal if the pool is active and the mmp thread is
 * running, otherwise there is no thread to wake.
 */
static void
mmp_signal_thread(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	if (mmp->mmp_thread)
		cv_broadcast(&mmp->mmp_thread_cv);
	mutex_exit(&mmp->mmp_thread_lock);
}

void
mmp_signal_all_threads(void)
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa))) {
		if (spa->spa_state == POOL_STATE_ACTIVE)
			mmp_signal_thread(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
#include <linux/mod_compat.h>

static int
param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
{
	int ret;

	ret = param_set_ulong(val, kp);
	if (ret < 0)
		return (ret);

	mmp_signal_all_threads();

	return (ret);
}

module_param(zfs_multihost_fail_intervals, uint, 0644);
MODULE_PARM_DESC(zfs_multihost_fail_intervals,
	"Max allowed period without a successful mmp write");

module_param_call(zfs_multihost_interval, param_set_multihost_interval,
	param_get_ulong, &zfs_multihost_interval, 0644);
MODULE_PARM_DESC(zfs_multihost_interval,
	"Milliseconds between mmp writes to each leaf");

module_param(zfs_multihost_import_intervals, uint, 0644);
MODULE_PARM_DESC(zfs_multihost_import_intervals,
	"Number of zfs_multihost_interval periods to wait for activity");
#endif