4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
28 #include <sys/spa_impl.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zfs_context.h>
33 #include <sys/callb.h>
36 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
37 * or opening a pool on more than one host at a time. In particular, it
38 * prevents "zpool import -f" on a host from succeeding while the pool is
39 * already imported on another host. There are many other ways in which a
40 * device could be used by two hosts for different purposes at the same time
41 * resulting in pool damage. This implementation does not attempt to detect
44 * MMP operates by ensuring there are frequent visible changes on disk (a
45 * "heartbeat") at all times. And by altering the import process to check
46 * for these changes and failing the import when they are detected. This
47 * functionality is enabled by setting the 'multihost' pool property to on.
49 * Uberblocks written by the txg_sync thread always go into the first
50 * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
51 * They are used to hold uberblocks which are exactly the same as the last
52 * synced uberblock except that the ub_timestamp is frequently updated.
53 * Like all other uberblocks, the slot is written with an embedded checksum,
54 * and slots with invalid checksums are ignored. This provides the
55 * "heartbeat", with no risk of overwriting good uberblocks that must be
56 * preserved, e.g. previous txgs and associated block pointers.
58 * Two optional fields are added to uberblock structure: ub_mmp_magic and
59 * ub_mmp_delay. The magic field allows zfs to tell whether ub_mmp_delay is
60 * valid. The delay field is a decaying average of the amount of time between
61 * completion of successive MMP writes, in nanoseconds. It is used to predict
62 * how long the import must wait to detect activity in the pool, before
63 * concluding it is not in use.
65 * During import an activity test may now be performed to determine if
66 * the pool is in use. The activity test is typically required if the
67 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
68 * POOL_STATE_ACTIVE, and the pool is not a root pool.
70 * The activity test finds the "best" uberblock (highest txg & timestamp),
71 * waits some time, and then finds the "best" uberblock again. If the txg
72 * and timestamp in both "best" uberblocks do not match, the pool is in use
73 * by another host and the import fails. Since the granularity of the
74 * timestamp is in seconds this activity test must take a bare minimum of one
75 * second. In order to assure the accuracy of the activity test, the default
76 * values result in an activity test duration of 10x the mmp write interval.
78 * The "zpool import" activity test can be expected to take a minimum time of
79 * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds. If the
80 * "best" uberblock has a valid ub_mmp_delay field, then the duration of the
81 * test may take longer if MMP writes were occurring less frequently than
82 * expected. Additionally, the duration is then extended by a random 25% to
83 * attempt to detect simultaneous imports. For example, if both partner
84 * hosts are rebooted at the same time and automatically attempt to import the
89 * Used to control the frequency of mmp writes which are performed when the
90 * 'multihost' pool property is on. This is one factor used to determine the
91 * length of the activity check during import.
93 * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
94 * This means that on average an mmp write will be issued for each leaf vdev
95 * every zfs_multihost_interval milliseconds. In practice, the observed period
96 * can vary with the I/O load and this observed value is the delay which is
97 * stored in the uberblock. The minimum allowed value is 100 ms.
99 ulong_t zfs_multihost_interval
= MMP_DEFAULT_INTERVAL
;
102 * Used to control the duration of the activity test on import. Smaller values
103 * of zfs_multihost_import_intervals will reduce the import time but increase
104 * the risk of failing to detect an active pool. The total activity check time
105 * is never allowed to drop below one second. A value of 0 is ignored and
106 * treated as if it was set to 1.
108 uint_t zfs_multihost_import_intervals
= MMP_DEFAULT_IMPORT_INTERVALS
;
111 * Controls the behavior of the pool when mmp write failures are detected.
113 * When zfs_multihost_fail_intervals = 0 then mmp write failures are ignored.
114 * The failures will still be reported to the ZED which depending on its
115 * configuration may take action such as suspending the pool or taking a
118 * When zfs_multihost_fail_intervals > 0 then sequential mmp write failures will
119 * cause the pool to be suspended. This occurs when
120 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
121 * passed since the last successful mmp write. This guarantees the activity
122 * test will see mmp writes if the
125 uint_t zfs_multihost_fail_intervals
= MMP_DEFAULT_FAIL_INTERVALS
;
127 char *mmp_tag
= "mmp_write_uberblock";
128 static void mmp_thread(void *arg
);
133 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
135 mutex_init(&mmp
->mmp_thread_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
136 cv_init(&mmp
->mmp_thread_cv
, NULL
, CV_DEFAULT
, NULL
);
137 mutex_init(&mmp
->mmp_io_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
138 mmp
->mmp_kstat_id
= 1;
144 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
146 mutex_destroy(&mmp
->mmp_thread_lock
);
147 cv_destroy(&mmp
->mmp_thread_cv
);
148 mutex_destroy(&mmp
->mmp_io_lock
);
152 mmp_thread_enter(mmp_thread_t
*mmp
, callb_cpr_t
*cpr
)
154 CALLB_CPR_INIT(cpr
, &mmp
->mmp_thread_lock
, callb_generic_cpr
, FTAG
);
155 mutex_enter(&mmp
->mmp_thread_lock
);
159 mmp_thread_exit(mmp_thread_t
*mmp
, kthread_t
**mpp
, callb_cpr_t
*cpr
)
161 ASSERT(*mpp
!= NULL
);
163 cv_broadcast(&mmp
->mmp_thread_cv
);
164 CALLB_CPR_EXIT(cpr
); /* drops &mmp->mmp_thread_lock */
169 mmp_thread_start(spa_t
*spa
)
171 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
173 if (spa_writeable(spa
)) {
174 mutex_enter(&mmp
->mmp_thread_lock
);
175 if (!mmp
->mmp_thread
) {
176 dprintf("mmp_thread_start pool %s\n",
178 mmp
->mmp_thread
= thread_create(NULL
, 0, mmp_thread
,
179 spa
, 0, &p0
, TS_RUN
, defclsyspri
);
181 mutex_exit(&mmp
->mmp_thread_lock
);
186 mmp_thread_stop(spa_t
*spa
)
188 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
190 mutex_enter(&mmp
->mmp_thread_lock
);
191 mmp
->mmp_thread_exiting
= 1;
192 cv_broadcast(&mmp
->mmp_thread_cv
);
194 while (mmp
->mmp_thread
) {
195 cv_wait(&mmp
->mmp_thread_cv
, &mmp
->mmp_thread_lock
);
197 mutex_exit(&mmp
->mmp_thread_lock
);
199 ASSERT(mmp
->mmp_thread
== NULL
);
200 mmp
->mmp_thread_exiting
= 0;
203 typedef enum mmp_vdev_state_flag
{
204 MMP_FAIL_NOT_WRITABLE
= (1 << 0),
205 MMP_FAIL_WRITE_PENDING
= (1 << 1),
206 } mmp_vdev_state_flag_t
;
209 mmp_random_leaf_impl(vdev_t
*vd
, int *fail_mask
)
213 if (!vdev_writeable(vd
)) {
214 *fail_mask
|= MMP_FAIL_NOT_WRITABLE
;
218 if (vd
->vdev_ops
->vdev_op_leaf
) {
221 if (vd
->vdev_mmp_pending
!= 0) {
222 *fail_mask
|= MMP_FAIL_WRITE_PENDING
;
231 child_idx
= spa_get_random(vd
->vdev_children
);
232 for (int offset
= vd
->vdev_children
; offset
> 0; offset
--) {
234 vdev_t
*child
= vd
->vdev_child
[(child_idx
+ offset
) %
237 leaf
= mmp_random_leaf_impl(child
, fail_mask
);
246 * Find a leaf vdev to write an MMP block to. It must not have an outstanding
247 * mmp write (if so a new write will also likely block). If there is no usable
248 * leaf in the tree rooted at in_vd, a nonzero error value is returned, and
249 * *out_vd is unchanged.
251 * The error value returned is a bit field.
253 * MMP_FAIL_WRITE_PENDING
254 * If set, one or more leaf vdevs are writeable, but have an MMP write which has
257 * MMP_FAIL_NOT_WRITABLE
258 * If set, one or more vdevs are not writeable. The children of those vdevs
261 * Assuming in_vd points to a tree, a random subtree will be chosen to start.
262 * That subtree, and successive ones, will be walked until a usable leaf has
263 * been found, or all subtrees have been examined (except that the children of
264 * un-writeable vdevs are not examined).
266 * If the leaf vdevs in the tree are healthy, the distribution of returned leaf
267 * vdevs will be even. If there are unhealthy leaves, the following leaves
268 * (child_index % index_children) will be chosen more often.
272 mmp_random_leaf(vdev_t
*in_vd
, vdev_t
**out_vd
)
275 vdev_t
*vd
= mmp_random_leaf_impl(in_vd
, &error_mask
);
284 mmp_write_done(zio_t
*zio
)
286 spa_t
*spa
= zio
->io_spa
;
287 vdev_t
*vd
= zio
->io_vd
;
288 mmp_thread_t
*mts
= zio
->io_private
;
290 mutex_enter(&mts
->mmp_io_lock
);
291 uint64_t mmp_kstat_id
= vd
->vdev_mmp_kstat_id
;
292 hrtime_t mmp_write_duration
= gethrtime() - vd
->vdev_mmp_pending
;
298 * Mmp writes are queued on a fixed schedule, but under many
299 * circumstances, such as a busy device or faulty hardware,
300 * the writes will complete at variable, much longer,
301 * intervals. In these cases, another node checking for
302 * activity must wait longer to account for these delays.
304 * The mmp_delay is calculated as a decaying average of the interval
305 * between completed mmp writes. This is used to predict how long
306 * the import must wait to detect activity in the pool, before
307 * concluding it is not in use.
309 * Do not set mmp_delay if the multihost property is not on,
310 * so as not to trigger an activity check on import.
312 if (spa_multihost(spa
)) {
313 hrtime_t delay
= gethrtime() - mts
->mmp_last_write
;
315 if (delay
> mts
->mmp_delay
)
316 mts
->mmp_delay
= delay
;
318 mts
->mmp_delay
= (delay
+ mts
->mmp_delay
* 127) /
323 mts
->mmp_last_write
= gethrtime();
326 vd
->vdev_mmp_pending
= 0;
327 vd
->vdev_mmp_kstat_id
= 0;
329 mutex_exit(&mts
->mmp_io_lock
);
330 spa_config_exit(spa
, SCL_STATE
, mmp_tag
);
332 spa_mmp_history_set(spa
, mmp_kstat_id
, zio
->io_error
,
335 abd_free(zio
->io_abd
);
339 * When the uberblock on-disk is updated by a spa_sync,
340 * creating a new "best" uberblock, update the one stored
341 * in the mmp thread state, used for mmp writes.
344 mmp_update_uberblock(spa_t
*spa
, uberblock_t
*ub
)
346 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
348 mutex_enter(&mmp
->mmp_io_lock
);
350 mmp
->mmp_ub
.ub_timestamp
= gethrestime_sec();
351 mutex_exit(&mmp
->mmp_io_lock
);
355 * Choose a random vdev, label, and MMP block, and write over it
356 * with a copy of the last-synced uberblock, whose timestamp
357 * has been updated to reflect that the pool is in use.
360 mmp_write_uberblock(spa_t
*spa
)
362 int flags
= ZIO_FLAG_CONFIG_WRITER
| ZIO_FLAG_CANFAIL
;
363 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
369 hrtime_t lock_acquire_time
= gethrtime();
370 spa_config_enter(spa
, SCL_STATE
, mmp_tag
, RW_READER
);
371 lock_acquire_time
= gethrtime() - lock_acquire_time
;
372 if (lock_acquire_time
> (MSEC2NSEC(MMP_MIN_INTERVAL
) / 10))
373 zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
374 (u_longlong_t
)lock_acquire_time
);
376 error
= mmp_random_leaf(spa
->spa_root_vdev
, &vd
);
378 mutex_enter(&mmp
->mmp_io_lock
);
381 * spa_mmp_history has two types of entries:
382 * Issued MMP write: records time issued, error status, etc.
383 * Skipped MMP write: an MMP write could not be issued because no
384 * suitable leaf vdev was available. See comment above struct
385 * spa_mmp_history for details.
389 if (mmp
->mmp_skip_error
== error
) {
390 spa_mmp_history_set_skip(spa
, mmp
->mmp_kstat_id
- 1);
392 mmp
->mmp_skip_error
= error
;
393 spa_mmp_history_add(spa
, mmp
->mmp_ub
.ub_txg
,
394 gethrestime_sec(), mmp
->mmp_delay
, NULL
, 0,
395 mmp
->mmp_kstat_id
++, error
);
397 mutex_exit(&mmp
->mmp_io_lock
);
398 spa_config_exit(spa
, SCL_STATE
, FTAG
);
402 mmp
->mmp_skip_error
= 0;
404 if (mmp
->mmp_zio_root
== NULL
)
405 mmp
->mmp_zio_root
= zio_root(spa
, NULL
, NULL
,
406 flags
| ZIO_FLAG_GODFATHER
);
409 ub
->ub_timestamp
= gethrestime_sec();
410 ub
->ub_mmp_magic
= MMP_MAGIC
;
411 ub
->ub_mmp_delay
= mmp
->mmp_delay
;
412 vd
->vdev_mmp_pending
= gethrtime();
413 vd
->vdev_mmp_kstat_id
= mmp
->mmp_kstat_id
;
415 zio_t
*zio
= zio_null(mmp
->mmp_zio_root
, spa
, NULL
, NULL
, NULL
, flags
);
416 abd_t
*ub_abd
= abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd
), B_TRUE
);
417 abd_zero(ub_abd
, VDEV_UBERBLOCK_SIZE(vd
));
418 abd_copy_from_buf(ub_abd
, ub
, sizeof (uberblock_t
));
421 mutex_exit(&mmp
->mmp_io_lock
);
423 offset
= VDEV_UBERBLOCK_OFFSET(vd
, VDEV_UBERBLOCK_COUNT(vd
) -
424 MMP_BLOCKS_PER_LABEL
+ spa_get_random(MMP_BLOCKS_PER_LABEL
));
426 label
= spa_get_random(VDEV_LABELS
);
427 vdev_label_write(zio
, vd
, label
, ub_abd
, offset
,
428 VDEV_UBERBLOCK_SIZE(vd
), mmp_write_done
, mmp
,
429 flags
| ZIO_FLAG_DONT_PROPAGATE
);
431 (void) spa_mmp_history_add(spa
, ub
->ub_txg
, ub
->ub_timestamp
,
432 ub
->ub_mmp_delay
, vd
, label
, vd
->vdev_mmp_kstat_id
, 0);
438 mmp_thread(void *arg
)
440 spa_t
*spa
= (spa_t
*)arg
;
441 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
442 boolean_t last_spa_suspended
= spa_suspended(spa
);
443 boolean_t last_spa_multihost
= spa_multihost(spa
);
445 hrtime_t max_fail_ns
= zfs_multihost_fail_intervals
*
446 MSEC2NSEC(MAX(zfs_multihost_interval
, MMP_MIN_INTERVAL
));
448 mmp_thread_enter(mmp
, &cpr
);
451 * The mmp_write_done() function calculates mmp_delay based on the
452 * prior value of mmp_delay and the elapsed time since the last write.
453 * For the first mmp write, there is no "last write", so we start
454 * with fake, but reasonable, default non-zero values.
456 mmp
->mmp_delay
= MSEC2NSEC(MAX(zfs_multihost_interval
,
457 MMP_MIN_INTERVAL
)) / MAX(vdev_count_leaves(spa
), 1);
458 mmp
->mmp_last_write
= gethrtime() - mmp
->mmp_delay
;
460 while (!mmp
->mmp_thread_exiting
) {
461 uint64_t mmp_fail_intervals
= zfs_multihost_fail_intervals
;
462 uint64_t mmp_interval
= MSEC2NSEC(
463 MAX(zfs_multihost_interval
, MMP_MIN_INTERVAL
));
464 boolean_t suspended
= spa_suspended(spa
);
465 boolean_t multihost
= spa_multihost(spa
);
466 hrtime_t start
, next_time
;
470 next_time
= start
+ mmp_interval
/
471 MAX(vdev_count_leaves(spa
), 1);
473 next_time
= start
+ MSEC2NSEC(MMP_DEFAULT_INTERVAL
);
477 * MMP off => on, or suspended => !suspended:
478 * No writes occurred recently. Update mmp_last_write to give
479 * us some time to try.
481 if ((!last_spa_multihost
&& multihost
) ||
482 (last_spa_suspended
&& !suspended
)) {
483 mutex_enter(&mmp
->mmp_io_lock
);
484 mmp
->mmp_last_write
= gethrtime();
485 mutex_exit(&mmp
->mmp_io_lock
);
490 * mmp_delay == 0 tells importing node to skip activity check.
492 if (last_spa_multihost
&& !multihost
) {
493 mutex_enter(&mmp
->mmp_io_lock
);
495 mutex_exit(&mmp
->mmp_io_lock
);
497 last_spa_multihost
= multihost
;
498 last_spa_suspended
= suspended
;
501 * Smooth max_fail_ns when its factors are decreased, because
502 * making (max_fail_ns < mmp_interval) results in the pool being
503 * immediately suspended before writes can occur at the new
506 if ((mmp_interval
* mmp_fail_intervals
) < max_fail_ns
) {
507 max_fail_ns
= ((31 * max_fail_ns
) + (mmp_interval
*
508 mmp_fail_intervals
)) / 32;
510 max_fail_ns
= mmp_interval
* mmp_fail_intervals
;
514 * Suspend the pool if no MMP write has succeeded in over
515 * mmp_interval * mmp_fail_intervals nanoseconds.
517 if (!suspended
&& mmp_fail_intervals
&& multihost
&&
518 (start
- mmp
->mmp_last_write
) > max_fail_ns
) {
519 cmn_err(CE_WARN
, "MMP writes to pool '%s' have not "
520 "succeeded in over %llus; suspending pool",
522 NSEC2SEC(start
- mmp
->mmp_last_write
));
523 zio_suspend(spa
, NULL
, ZIO_SUSPEND_MMP
);
526 if (multihost
&& !suspended
)
527 mmp_write_uberblock(spa
);
529 CALLB_CPR_SAFE_BEGIN(&cpr
);
530 (void) cv_timedwait_sig_hires(&mmp
->mmp_thread_cv
,
531 &mmp
->mmp_thread_lock
, next_time
, USEC2NSEC(1),
532 CALLOUT_FLAG_ABSOLUTE
);
533 CALLB_CPR_SAFE_END(&cpr
, &mmp
->mmp_thread_lock
);
536 /* Outstanding writes are allowed to complete. */
537 if (mmp
->mmp_zio_root
)
538 zio_wait(mmp
->mmp_zio_root
);
540 mmp
->mmp_zio_root
= NULL
;
541 mmp_thread_exit(mmp
, &mmp
->mmp_thread
, &cpr
);
545 * Signal the MMP thread to wake it, when it is sleeping on
546 * its cv. Used when some module parameter has changed and
547 * we want the thread to know about it.
548 * Only signal if the pool is active and mmp thread is
549 * running, otherwise there is no thread to wake.
552 mmp_signal_thread(spa_t
*spa
)
554 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
556 mutex_enter(&mmp
->mmp_thread_lock
);
558 cv_broadcast(&mmp
->mmp_thread_cv
);
559 mutex_exit(&mmp
->mmp_thread_lock
);
563 mmp_signal_all_threads(void)
567 mutex_enter(&spa_namespace_lock
);
568 while ((spa
= spa_next(spa
))) {
569 if (spa
->spa_state
== POOL_STATE_ACTIVE
)
570 mmp_signal_thread(spa
);
572 mutex_exit(&spa_namespace_lock
);
575 #if defined(_KERNEL) && defined(HAVE_SPL)
576 #include <linux/mod_compat.h>
579 param_set_multihost_interval(const char *val
, zfs_kernel_param_t
*kp
)
583 ret
= param_set_ulong(val
, kp
);
587 mmp_signal_all_threads();
593 module_param(zfs_multihost_fail_intervals
, uint
, 0644);
594 MODULE_PARM_DESC(zfs_multihost_fail_intervals
,
595 "Max allowed period without a successful mmp write");
597 module_param_call(zfs_multihost_interval
, param_set_multihost_interval
,
598 param_get_ulong
, &zfs_multihost_interval
, 0644);
599 MODULE_PARM_DESC(zfs_multihost_interval
,
600 "Milliseconds between mmp writes to each leaf");
602 module_param(zfs_multihost_import_intervals
, uint
, 0644);
603 MODULE_PARM_DESC(zfs_multihost_import_intervals
,
604 "Number of zfs_multihost_interval periods to wait for activity");