4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
28 #include <sys/spa_impl.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zfs_context.h>
33 #include <sys/callb.h>
36 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
37 * or opening a pool on more than one host at a time. In particular, it
38 * prevents "zpool import -f" on a host from succeeding while the pool is
39 * already imported on another host. There are many other ways in which a
40 * device could be used by two hosts for different purposes at the same time
41 * resulting in pool damage. This implementation does not attempt to detect
44 * MMP operates by ensuring there are frequent visible changes on disk (a
45 * "heartbeat") at all times, and by altering the import process to check
46 * for these changes and fail the import when they are detected. This
47 * functionality is enabled by setting the 'multihost' pool property to on.
49 * Uberblocks written by the txg_sync thread always go into the first
50 * (N-MMP_BLOCKS_PER_LABEL) slots; the remaining slots are reserved for MMP.
51 * They are used to hold uberblocks which are exactly the same as the last
52 * synced uberblock except that the ub_timestamp is frequently updated.
53 * Like all other uberblocks, the slot is written with an embedded checksum,
54 * and slots with invalid checksums are ignored. This provides the
55 * "heartbeat", with no risk of overwriting good uberblocks that must be
56 * preserved, e.g. previous txgs and associated block pointers.
58 * Two optional fields are added to uberblock structure: ub_mmp_magic and
59 * ub_mmp_delay. The magic field allows zfs to tell whether ub_mmp_delay is
60 * valid. The delay field is a decaying average of the amount of time between
61 * completion of successive MMP writes, in nanoseconds. It is used to predict
62 * how long the import must wait to detect activity in the pool, before
63 * concluding it is not in use.
65 * During import an activity test may now be performed to determine if
66 * the pool is in use. The activity test is typically required if the
67 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
68 * POOL_STATE_ACTIVE, and the pool is not a root pool.
70 * The activity test finds the "best" uberblock (highest txg & timestamp),
71 * waits some time, and then finds the "best" uberblock again. If the txg
72 * and timestamp in both "best" uberblocks do not match, the pool is in use
73 * by another host and the import fails. Since the granularity of the
74 * timestamp is in seconds this activity test must take a bare minimum of one
75 * second. In order to assure the accuracy of the activity test, the default
76 * values result in an activity test duration of 10x the mmp write interval.
78 * The "zpool import" activity test can be expected to take a minimum time of
79 * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds. If the
80 * "best" uberblock has a valid ub_mmp_delay field, then the duration of the
81 * test may take longer if MMP writes were occurring less frequently than
82 * expected. Additionally, the duration is then extended by a random 25% to
83 * attempt to detect simultaneous imports. For example, if both partner
84 * hosts are rebooted at the same time and automatically attempt to import the
89 * Used to control the frequency of mmp writes which are performed when the
90 * 'multihost' pool property is on. This is one factor used to determine the
91 * length of the activity check during import.
93 * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
94 * This means that on average an mmp write will be issued for each leaf vdev
95 * every zfs_multihost_interval milliseconds. In practice, the observed period
96 * can vary with the I/O load and this observed value is the delay which is
97 * stored in the uberblock. The minimum allowed value is 100 ms.
/*
 * Initialised to MMP_DEFAULT_INTERVAL.  mmp_thread() raises the value it
 * reads to at least MMP_MIN_INTERVAL before converting it to nanoseconds.
 */
99 ulong_t zfs_multihost_interval
= MMP_DEFAULT_INTERVAL
;
102 * Used to control the duration of the activity test on import. Smaller values
103 * of zfs_multihost_import_intervals will reduce the import time but increase
104 * the risk of failing to detect an active pool. The total activity check time
105 * is never allowed to drop below one second. A value of 0 is ignored and
106 * treated as if it was set to 1.
/*
 * Initialised to MMP_DEFAULT_IMPORT_INTERVALS.  No code visible in this file
 * reads it; it is only registered as a module parameter further down.
 */
108 uint_t zfs_multihost_import_intervals
= MMP_DEFAULT_IMPORT_INTERVALS
;
111 * Controls the behavior of the pool when mmp write failures are detected.
113 * When zfs_multihost_fail_intervals = 0 then mmp write failures are ignored.
114 * The failures will still be reported to the ZED which depending on its
115 * configuration may take action such as suspending the pool or taking a
118 * When zfs_multihost_fail_intervals > 0 then sequential mmp write failures will
119 * cause the pool to be suspended. This occurs when
120 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
121 * passed since the last successful mmp write. This guarantees the activity
122 * test will see mmp writes if the
/*
 * Initialised to MMP_DEFAULT_FAIL_INTERVALS.  mmp_thread() re-reads it on
 * every loop iteration and only evaluates its suspend check when the value
 * is nonzero.
 */
125 uint_t zfs_multihost_fail_intervals
= MMP_DEFAULT_FAIL_INTERVALS
;
/*
 * Holder tag passed to spa_config_enter() in mmp_write_uberblock() and to
 * spa_config_exit() in mmp_write_done() for the SCL_STATE config lock.
 */
127 char *mmp_tag
= "mmp_write_uberblock";
/*
 * Forward declaration: mmp_thread_start() hands mmp_thread to
 * thread_create() before the function is defined.
 */
128 static void mmp_thread(void *arg
);
/*
 * Body of the MMP state initialiser.  NOTE(review): the function header is
 * not visible here; presumably this is mmp_init(spa_t *spa) -- confirm.
 *
 * Initialises the three synchronisation objects embedded in spa->spa_mmp
 * (mmp_thread_lock, mmp_thread_cv, mmp_io_lock) and starts mmp_kstat_id
 * at 1.
 */
133 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
135 mutex_init(&mmp
->mmp_thread_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
136 cv_init(&mmp
->mmp_thread_cv
, NULL
, CV_DEFAULT
, NULL
);
137 mutex_init(&mmp
->mmp_io_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
138 mmp
->mmp_kstat_id
= 1;
/*
 * Body of the MMP state teardown.  NOTE(review): the function header is not
 * visible here; presumably this is mmp_fini(spa_t *spa) -- confirm.
 *
 * Destroys mmp_thread_lock, mmp_thread_cv and mmp_io_lock in spa->spa_mmp,
 * i.e. the same three objects the initialiser above sets up.
 */
144 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
146 mutex_destroy(&mmp
->mmp_thread_lock
);
147 cv_destroy(&mmp
->mmp_thread_cv
);
148 mutex_destroy(&mmp
->mmp_io_lock
);
/*
 * Entry bookkeeping for the MMP thread; called from mmp_thread().
 *
 * mmp - MMP state whose mmp_thread_lock is registered and acquired.
 * cpr - CPR callback record initialised with CALLB_CPR_INIT.
 *
 * No release of mmp_thread_lock is visible in this function, so the caller
 * continues with the lock held.
 */
152 mmp_thread_enter(mmp_thread_t
*mmp
, callb_cpr_t
*cpr
)
154 CALLB_CPR_INIT(cpr
, &mmp
->mmp_thread_lock
, callb_generic_cpr
, FTAG
);
155 mutex_enter(&mmp
->mmp_thread_lock
);
/*
 * Exit bookkeeping for the MMP thread; called last from mmp_thread() with
 * &mmp->mmp_thread as mpp.
 *
 * mmp - MMP state whose condition variable is broadcast.
 * mpp - address of the thread pointer; asserted to be non-NULL on entry.
 * cpr - CPR callback record torn down with CALLB_CPR_EXIT.
 *
 * The broadcast wakes mmp_thread_stop(), which waits on mmp_thread_cv until
 * mmp->mmp_thread is NULL.  NOTE(review): the store that clears *mpp and any
 * final thread-exit call are not visible in this view -- confirm.
 */
159 mmp_thread_exit(mmp_thread_t
*mmp
, kthread_t
**mpp
, callb_cpr_t
*cpr
)
161 ASSERT(*mpp
!= NULL
);
163 cv_broadcast(&mmp
->mmp_thread_cv
);
164 CALLB_CPR_EXIT(cpr
); /* drops &mmp->mmp_thread_lock */
/*
 * Start the MMP thread for a pool if it is not already running.
 *
 * spa - pool to start the thread for; also passed as the thread argument.
 *
 * Nothing is done unless spa_writeable(spa) is true.  The check of
 * mmp->mmp_thread and the thread_create() call both happen under
 * mmp_thread_lock, so at most one thread is created per pool.
 */
169 mmp_thread_start(spa_t
*spa
)
171 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
173 if (spa_writeable(spa
)) {
174 mutex_enter(&mmp
->mmp_thread_lock
);
175 if (!mmp
->mmp_thread
) {
176 dprintf("mmp_thread_start pool %s\n",
178 mmp
->mmp_thread
= thread_create(NULL
, 0, mmp_thread
,
179 spa
, 0, &p0
, TS_RUN
, defclsyspri
);
181 mutex_exit(&mmp
->mmp_thread_lock
);
/*
 * Ask the MMP thread to exit and wait until it has done so.
 *
 * spa - pool whose MMP thread is stopped.
 *
 * Under mmp_thread_lock: sets mmp_thread_exiting, wakes the thread with a
 * broadcast on mmp_thread_cv, then sleeps on the same cv until
 * mmp->mmp_thread reads as NULL.  mmp_thread_exiting is reset to 0 after the
 * lock is dropped so a later start sees a clean flag.
 */
186 mmp_thread_stop(spa_t
*spa
)
188 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
190 mutex_enter(&mmp
->mmp_thread_lock
);
191 mmp
->mmp_thread_exiting
= 1;
192 cv_broadcast(&mmp
->mmp_thread_cv
);
194 while (mmp
->mmp_thread
) {
195 cv_wait(&mmp
->mmp_thread_cv
, &mmp
->mmp_thread_lock
);
197 mutex_exit(&mmp
->mmp_thread_lock
);
199 ASSERT(mmp
->mmp_thread
== NULL
);
200 mmp
->mmp_thread_exiting
= 0;
/*
 * Bit flags that mmp_random_leaf_impl() ORs into its fail_mask argument to
 * record why a leaf vdev was rejected.  Each value is a distinct bit so
 * both reasons can be reported together.
 */
203 typedef enum mmp_vdev_state_flag
{
204 MMP_FAIL_NOT_WRITABLE
= (1 << 0),
205 MMP_FAIL_WRITE_PENDING
= (1 << 1),
206 } mmp_vdev_state_flag_t
;
/*
 * Recursive helper for mmp_random_leaf().
 *
 * vd        - root of the (sub)tree to search.
 * fail_mask - in/out; rejection reasons are ORed in as leaves are examined.
 *
 * For a leaf vdev: sets MMP_FAIL_NOT_WRITABLE when vdev_writeable() is
 * false, otherwise MMP_FAIL_WRITE_PENDING when vdev_mmp_pending is nonzero.
 * For an interior vdev: picks a starting child index with
 * spa_get_random(vd->vdev_children) and recurses into the children, walking
 * "offset" from vdev_children down to 1 relative to that index.
 *
 * NOTE(review): the local declarations, the return statements and the
 * modulus operand on the child index are not visible in this view.
 */
209 mmp_random_leaf_impl(vdev_t
*vd
, int *fail_mask
)
213 if (vd
->vdev_ops
->vdev_op_leaf
) {
216 if (!vdev_writeable(vd
)) {
217 *fail_mask
|= MMP_FAIL_NOT_WRITABLE
;
219 } else if (vd
->vdev_mmp_pending
!= 0) {
220 *fail_mask
|= MMP_FAIL_WRITE_PENDING
;
229 if (vd
->vdev_children
== 0)
232 child_idx
= spa_get_random(vd
->vdev_children
);
233 for (int offset
= vd
->vdev_children
; offset
> 0; offset
--) {
235 vdev_t
*child
= vd
->vdev_child
[(child_idx
+ offset
) %
238 leaf
= mmp_random_leaf_impl(child
, fail_mask
);
247 * Find a leaf vdev to write an MMP block to. It must not have an outstanding
248 * mmp write (if so a new write will also likely block). If there is no usable
249 * leaf in the tree rooted at in_vd, a nonzero error value is returned, and
250 * *out_vd is unchanged.
252 * The error value returned is a bit field.
254 * MMP_FAIL_WRITE_PENDING
255 * If set, one or more leaf vdevs are writeable, but have an MMP write which has
258 * MMP_FAIL_NOT_WRITABLE
259 * If set, one or more vdevs are not writeable. The children of those vdevs
262 * Assuming in_vd points to a tree, a random subtree will be chosen to start.
263 * That subtree, and successive ones, will be walked until a usable leaf has
264 * been found, or all subtrees have been examined (except that the children of
265 * un-writeable vdevs are not examined).
267 * If the leaf vdevs in the tree are healthy, the distribution of returned leaf
268 * vdevs will be even. If there are unhealthy leaves, the following leaves
269 * (child_index % index_children) will be chosen more often.
/*
 * Entry point for leaf selection; called from mmp_write_uberblock() with
 * the pool's root vdev.
 *
 * in_vd  - root of the tree to search.
 * out_vd - where the chosen leaf is reported.
 *
 * Delegates the search to mmp_random_leaf_impl(), which accumulates the
 * failure flags in error_mask.  NOTE(review): the declaration of
 * error_mask, the store to *out_vd and the return statements are not
 * visible in this view.
 */
273 mmp_random_leaf(vdev_t
*in_vd
, vdev_t
**out_vd
)
276 vdev_t
*vd
= mmp_random_leaf_impl(in_vd
, &error_mask
);
285 * MMP writes are issued on a fixed schedule, but may complete at variable,
286 * much longer, intervals. The mmp_delay captures long periods between
287 * successful writes for any reason, including disk latency, scheduling delays,
290 * The mmp_delay is usually calculated as a decaying average, but if the latest
291 * delay is higher we do not average it, so that we do not hide sudden spikes
292 * which the importing host must wait for.
294 * If writes are occurring frequently, such as due to a high rate of txg syncs,
295 * the mmp_delay could become very small. Since those short delays depend on
296 * activity we cannot count on, we never allow mmp_delay to get lower than
297 * the rate expected if only mmp_thread writes occur.
299 * If an mmp write was skipped or fails, and we have already waited longer than
300 * mmp_delay, we need to update it so the next write reflects the longer delay.
302 * Do not set mmp_delay if the multihost property is not on, so as not to
303 * trigger an activity check on import.
/*
 * Recompute mts->mmp_delay from the time elapsed since mmp_last_write.
 *
 * spa             - pool whose spa_mmp state is updated.
 * write_completed - B_TRUE when called for a completed write
 *                   (mmp_write_done() on success, mmp_update_uberblock());
 *                   B_FALSE for a skipped or failed write.
 *
 * The caller must hold mmp_io_lock (asserted).  A measured delay larger
 * than the stored one replaces it outright; a smaller one is blended in as
 * (delay + 127 * mmp_delay) / 128.  min_delay is computed as
 * zfs_multihost_interval (in ns) divided by the leaf count, with the divisor
 * clamped to at least 1 so an empty count cannot divide by zero.
 *
 * NOTE(review): the bodies of the spa_multihost() and write_completed
 * checks and the second MAX() operand are not visible in this view.
 */
306 mmp_delay_update(spa_t
*spa
, boolean_t write_completed
)
308 mmp_thread_t
*mts
= &spa
->spa_mmp
;
309 hrtime_t delay
= gethrtime() - mts
->mmp_last_write
;
311 ASSERT(MUTEX_HELD(&mts
->mmp_io_lock
));
313 if (spa_multihost(spa
) == B_FALSE
) {
318 if (delay
> mts
->mmp_delay
)
319 mts
->mmp_delay
= delay
;
321 if (write_completed
== B_FALSE
)
324 mts
->mmp_last_write
= gethrtime();
327 * strictly less than, in case delay was changed above.
329 if (delay
< mts
->mmp_delay
) {
330 hrtime_t min_delay
= MSEC2NSEC(zfs_multihost_interval
) /
331 MAX(1, vdev_count_leaves(spa
));
332 mts
->mmp_delay
= MAX(((delay
+ mts
->mmp_delay
* 127) / 128),
/*
 * Completion callback for an MMP write; installed by mmp_write_uberblock()
 * through vdev_label_write() with the mmp_thread_t as io_private.
 *
 * zio - the completed write; io_spa, io_vd, io_private, io_error and io_abd
 *       are read.
 *
 * Under mmp_io_lock: snapshots the vdev's kstat id, computes the write
 * duration as now minus vdev_mmp_pending (the issue time), updates the delay
 * estimate (a zero io_error counts as a completed write) and clears the
 * vdev's pending marker and kstat id.  After dropping the lock it releases
 * the SCL_STATE hold taken with mmp_tag when the write was issued, records
 * the result in the MMP history and frees the write buffer.
 */
338 mmp_write_done(zio_t
*zio
)
340 spa_t
*spa
= zio
->io_spa
;
341 vdev_t
*vd
= zio
->io_vd
;
342 mmp_thread_t
*mts
= zio
->io_private
;
344 mutex_enter(&mts
->mmp_io_lock
);
345 uint64_t mmp_kstat_id
= vd
->vdev_mmp_kstat_id
;
346 hrtime_t mmp_write_duration
= gethrtime() - vd
->vdev_mmp_pending
;
348 mmp_delay_update(spa
, (zio
->io_error
== 0));
350 vd
->vdev_mmp_pending
= 0;
351 vd
->vdev_mmp_kstat_id
= 0;
353 mutex_exit(&mts
->mmp_io_lock
);
354 spa_config_exit(spa
, SCL_STATE
, mmp_tag
);
356 spa_mmp_history_set(spa
, mmp_kstat_id
, zio
->io_error
,
359 abd_free(zio
->io_abd
);
363 * When the uberblock on-disk is updated by a spa_sync,
364 * creating a new "best" uberblock, update the one stored
365 * in the mmp thread state, used for mmp writes.
/*
 * Refresh the MMP thread's uberblock state after a sync.
 *
 * spa - pool whose spa_mmp state is updated.
 * ub  - the newly synced uberblock.
 *
 * Under mmp_io_lock: stamps mmp_ub.ub_timestamp with the current time in
 * seconds and treats the sync as a completed write for the delay estimate.
 * NOTE(review): no statement that reads ub is visible in this view;
 * presumably *ub is copied into mmp->mmp_ub first -- confirm.
 */
368 mmp_update_uberblock(spa_t
*spa
, uberblock_t
*ub
)
370 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
372 mutex_enter(&mmp
->mmp_io_lock
);
374 mmp
->mmp_ub
.ub_timestamp
= gethrestime_sec();
375 mmp_delay_update(spa
, B_TRUE
);
376 mutex_exit(&mmp
->mmp_io_lock
);
380 * Choose a random vdev, label, and MMP block, and write over it
381 * with a copy of the last-synced uberblock, whose timestamp
382 * has been updated to reflect that the pool is in use.
385 mmp_write_uberblock(spa_t
*spa
)
387 int flags
= ZIO_FLAG_CONFIG_WRITER
| ZIO_FLAG_CANFAIL
;
388 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
394 hrtime_t lock_acquire_time
= gethrtime();
395 spa_config_enter(spa
, SCL_STATE
, mmp_tag
, RW_READER
);
396 lock_acquire_time
= gethrtime() - lock_acquire_time
;
397 if (lock_acquire_time
> (MSEC2NSEC(MMP_MIN_INTERVAL
) / 10))
398 zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
399 (u_longlong_t
)lock_acquire_time
);
401 error
= mmp_random_leaf(spa
->spa_root_vdev
, &vd
);
403 mutex_enter(&mmp
->mmp_io_lock
);
406 * spa_mmp_history has two types of entries:
407 * Issued MMP write: records time issued, error status, etc.
408 * Skipped MMP write: an MMP write could not be issued because no
409 * suitable leaf vdev was available. See comment above struct
410 * spa_mmp_history for details.
414 mmp_delay_update(spa
, B_FALSE
);
415 if (mmp
->mmp_skip_error
== error
) {
416 spa_mmp_history_set_skip(spa
, mmp
->mmp_kstat_id
- 1);
418 mmp
->mmp_skip_error
= error
;
419 spa_mmp_history_add(spa
, mmp
->mmp_ub
.ub_txg
,
420 gethrestime_sec(), mmp
->mmp_delay
, NULL
, 0,
421 mmp
->mmp_kstat_id
++, error
);
423 mutex_exit(&mmp
->mmp_io_lock
);
424 spa_config_exit(spa
, SCL_STATE
, FTAG
);
428 mmp
->mmp_skip_error
= 0;
430 if (mmp
->mmp_zio_root
== NULL
)
431 mmp
->mmp_zio_root
= zio_root(spa
, NULL
, NULL
,
432 flags
| ZIO_FLAG_GODFATHER
);
435 ub
->ub_timestamp
= gethrestime_sec();
436 ub
->ub_mmp_magic
= MMP_MAGIC
;
437 ub
->ub_mmp_delay
= mmp
->mmp_delay
;
438 vd
->vdev_mmp_pending
= gethrtime();
439 vd
->vdev_mmp_kstat_id
= mmp
->mmp_kstat_id
;
441 zio_t
*zio
= zio_null(mmp
->mmp_zio_root
, spa
, NULL
, NULL
, NULL
, flags
);
442 abd_t
*ub_abd
= abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd
), B_TRUE
);
443 abd_zero(ub_abd
, VDEV_UBERBLOCK_SIZE(vd
));
444 abd_copy_from_buf(ub_abd
, ub
, sizeof (uberblock_t
));
447 mutex_exit(&mmp
->mmp_io_lock
);
449 offset
= VDEV_UBERBLOCK_OFFSET(vd
, VDEV_UBERBLOCK_COUNT(vd
) -
450 MMP_BLOCKS_PER_LABEL
+ spa_get_random(MMP_BLOCKS_PER_LABEL
));
452 label
= spa_get_random(VDEV_LABELS
);
453 vdev_label_write(zio
, vd
, label
, ub_abd
, offset
,
454 VDEV_UBERBLOCK_SIZE(vd
), mmp_write_done
, mmp
,
455 flags
| ZIO_FLAG_DONT_PROPAGATE
);
457 (void) spa_mmp_history_add(spa
, ub
->ub_txg
, ub
->ub_timestamp
,
458 ub
->ub_mmp_delay
, vd
, label
, vd
->vdev_mmp_kstat_id
, 0);
/*
 * Main loop of the per-pool MMP thread created by mmp_thread_start().
 *
 * arg - the spa_t the thread serves.
 *
 * After mmp_thread_enter(), seeds mmp_delay and mmp_last_write with
 * non-zero defaults, then loops until mmp_thread_exiting is set.  Each
 * iteration re-reads the tunables and the pool's suspended/multihost state,
 * resets mmp_last_write when multihost turns on or the pool resumes,
 * recomputes max_fail_ns (a decrease is blended in as a 31:1 weighted
 * average), suspends the pool when no write has succeeded within
 * max_fail_ns, issues an MMP write when multihost is on and the pool is not
 * suspended, and sleeps on mmp_thread_cv until next_time.
 *
 * On exit it waits for mmp_zio_root, if any, before calling
 * mmp_thread_exit().
 *
 * NOTE(review): several lines (the cpr and next_time declarations, the
 * condition selecting between the two next_time formulas, the statement
 * inside the multihost-off branch, closing braces) are not visible here.
 */
464 mmp_thread(void *arg
)
466 spa_t
*spa
= (spa_t
*)arg
;
467 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
468 boolean_t last_spa_suspended
= spa_suspended(spa
);
469 boolean_t last_spa_multihost
= spa_multihost(spa
);
471 hrtime_t max_fail_ns
= zfs_multihost_fail_intervals
*
472 MSEC2NSEC(MAX(zfs_multihost_interval
, MMP_MIN_INTERVAL
));
474 mmp_thread_enter(mmp
, &cpr
);
477 * The mmp_write_done() function calculates mmp_delay based on the
478 * prior value of mmp_delay and the elapsed time since the last write.
479 * For the first mmp write, there is no "last write", so we start
480 * with fake, but reasonable, default non-zero values.
482 mmp
->mmp_delay
= MSEC2NSEC(MAX(zfs_multihost_interval
,
483 MMP_MIN_INTERVAL
)) / MAX(vdev_count_leaves(spa
), 1);
484 mmp
->mmp_last_write
= gethrtime() - mmp
->mmp_delay
;
486 while (!mmp
->mmp_thread_exiting
) {
/* Tunables are sampled once per iteration so changes take effect. */
487 uint64_t mmp_fail_intervals
= zfs_multihost_fail_intervals
;
488 uint64_t mmp_interval
= MSEC2NSEC(
489 MAX(zfs_multihost_interval
, MMP_MIN_INTERVAL
));
490 boolean_t suspended
= spa_suspended(spa
);
491 boolean_t multihost
= spa_multihost(spa
);
495 next_time
= gethrtime() + mmp_interval
/
496 MAX(vdev_count_leaves(spa
), 1);
498 next_time
= gethrtime() +
499 MSEC2NSEC(MMP_DEFAULT_INTERVAL
);
502 * MMP off => on, or suspended => !suspended:
503 * No writes occurred recently. Update mmp_last_write to give
504 * us some time to try.
506 if ((!last_spa_multihost
&& multihost
) ||
507 (last_spa_suspended
&& !suspended
)) {
508 mutex_enter(&mmp
->mmp_io_lock
);
509 mmp
->mmp_last_write
= gethrtime();
510 mutex_exit(&mmp
->mmp_io_lock
);
515 * mmp_delay == 0 tells importing node to skip activity check.
517 if (last_spa_multihost
&& !multihost
) {
518 mutex_enter(&mmp
->mmp_io_lock
);
520 mutex_exit(&mmp
->mmp_io_lock
);
522 last_spa_multihost
= multihost
;
523 last_spa_suspended
= suspended
;
526 * Smooth max_fail_ns when its factors are decreased, because
527 * making (max_fail_ns < mmp_interval) results in the pool being
528 * immediately suspended before writes can occur at the new
531 if ((mmp_interval
* mmp_fail_intervals
) < max_fail_ns
) {
532 max_fail_ns
= ((31 * max_fail_ns
) + (mmp_interval
*
533 mmp_fail_intervals
)) / 32;
535 max_fail_ns
= mmp_interval
* mmp_fail_intervals
;
539 * Suspend the pool if no MMP write has succeeded in over
540 * mmp_interval * mmp_fail_intervals nanoseconds.
542 if (!suspended
&& mmp_fail_intervals
&& multihost
&&
543 (gethrtime() - mmp
->mmp_last_write
) > max_fail_ns
) {
544 cmn_err(CE_WARN
, "MMP writes to pool '%s' have not "
545 "succeeded in over %llus; suspending pool",
547 NSEC2SEC(gethrtime() - mmp
->mmp_last_write
));
548 zio_suspend(spa
, NULL
, ZIO_SUSPEND_MMP
);
551 if (multihost
&& !suspended
)
552 mmp_write_uberblock(spa
);
/* Sleep until next_time (absolute) or until the cv is signalled. */
554 CALLB_CPR_SAFE_BEGIN(&cpr
);
555 (void) cv_timedwait_sig_hires(&mmp
->mmp_thread_cv
,
556 &mmp
->mmp_thread_lock
, next_time
, USEC2NSEC(1),
557 CALLOUT_FLAG_ABSOLUTE
);
558 CALLB_CPR_SAFE_END(&cpr
, &mmp
->mmp_thread_lock
);
561 /* Outstanding writes are allowed to complete. */
562 if (mmp
->mmp_zio_root
)
563 zio_wait(mmp
->mmp_zio_root
);
565 mmp
->mmp_zio_root
= NULL
;
566 mmp_thread_exit(mmp
, &mmp
->mmp_thread
, &cpr
);
570 * Signal the MMP thread to wake it, when it is sleeping on
571 * its cv. Used when some module parameter has changed and
572 * we want the thread to know about it.
573 * Only signal if the pool is active and mmp thread is
574 * running, otherwise there is no thread to wake.
/*
 * Wake one pool's MMP thread from its timed wait.
 *
 * spa - pool whose MMP thread is signalled.
 *
 * The broadcast on mmp_thread_cv is made while holding mmp_thread_lock,
 * the same lock the thread passes to its timed wait.
 */
577 mmp_signal_thread(spa_t
*spa
)
579 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
581 mutex_enter(&mmp
->mmp_thread_lock
);
583 cv_broadcast(&mmp
->mmp_thread_cv
);
584 mutex_exit(&mmp
->mmp_thread_lock
);
/*
 * Wake the MMP thread of every active pool.
 *
 * Walks the pools with spa_next() while holding spa_namespace_lock and
 * calls mmp_signal_thread() for each one whose spa_state is
 * POOL_STATE_ACTIVE.  NOTE(review): the declaration and initial value of
 * "spa" are not visible in this view.
 */
588 mmp_signal_all_threads(void)
592 mutex_enter(&spa_namespace_lock
);
593 while ((spa
= spa_next(spa
))) {
594 if (spa
->spa_state
== POOL_STATE_ACTIVE
)
595 mmp_signal_thread(spa
);
597 mutex_exit(&spa_namespace_lock
);
601 #include <linux/mod_compat.h>
/*
 * Setter installed for the zfs_multihost_interval module parameter.
 *
 * val - new value as text.
 * kp  - kernel parameter descriptor.
 *
 * Parses and stores the value with param_set_ulong(), then wakes all MMP
 * threads when spa_mode_global is nonzero so they notice the new interval.
 * NOTE(review): the declaration of ret, any check of it and the return
 * statement are not visible in this view.
 */
604 param_set_multihost_interval(const char *val
, zfs_kernel_param_t
*kp
)
608 ret
= param_set_ulong(val
, kp
);
612 if (spa_mode_global
!= 0)
613 mmp_signal_all_threads();
/*
 * Module parameter registrations, all with mode 0644.
 * zfs_multihost_fail_intervals and zfs_multihost_import_intervals are plain
 * uint parameters.  zfs_multihost_interval is registered with
 * module_param_call so that writes go through
 * param_set_multihost_interval().
 */
619 module_param(zfs_multihost_fail_intervals
, uint
, 0644);
620 MODULE_PARM_DESC(zfs_multihost_fail_intervals
,
621 "Max allowed period without a successful mmp write");
623 module_param_call(zfs_multihost_interval
, param_set_multihost_interval
,
624 param_get_ulong
, &zfs_multihost_interval
, 0644);
625 MODULE_PARM_DESC(zfs_multihost_interval
,
626 "Milliseconds between mmp writes to each leaf");
628 module_param(zfs_multihost_import_intervals
, uint
, 0644);
629 MODULE_PARM_DESC(zfs_multihost_import_intervals
,
630 "Number of zfs_multihost_interval periods to wait for activity");