4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
28 #include <sys/spa_impl.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zfs_context.h>
33 #include <sys/callb.h>
36 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
37 * or opening a pool on more than one host at a time. In particular, it
38 * prevents "zpool import -f" on a host from succeeding while the pool is
39 * already imported on another host. There are many other ways in which a
40 * device could be used by two hosts for different purposes at the same time
41 * resulting in pool damage. This implementation does not attempt to detect
44 * MMP operates by ensuring there are frequent visible changes on disk (a
45 * "heartbeat") at all times. And by altering the import process to check
46 * for these changes and failing the import when they are detected. This
47 * functionality is enabled by setting the 'multihost' pool property to on.
49 * Uberblocks written by the txg_sync thread always go into the first
50 * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
51 * They are used to hold uberblocks which are exactly the same as the last
52 * synced uberblock except that the ub_timestamp is frequently updated.
53 * Like all other uberblocks, the slot is written with an embedded checksum,
54 * and slots with invalid checksums are ignored. This provides the
55 * "heartbeat", with no risk of overwriting good uberblocks that must be
56 * preserved, e.g. previous txgs and associated block pointers.
58 * Two optional fields are added to uberblock structure: ub_mmp_magic and
59 * ub_mmp_delay. The magic field allows zfs to tell whether ub_mmp_delay is
60 * valid. The delay field is a decaying average of the amount of time between
61 * completion of successive MMP writes, in nanoseconds. It is used to predict
62 * how long the import must wait to detect activity in the pool, before
63 * concluding it is not in use.
65 * During import an activity test may now be performed to determine if
66 * the pool is in use. The activity test is typically required if the
67 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
68 * POOL_STATE_ACTIVE, and the pool is not a root pool.
70 * The activity test finds the "best" uberblock (highest txg & timestamp),
71 * waits some time, and then finds the "best" uberblock again. If the txg
72 * and timestamp in both "best" uberblocks do not match, the pool is in use
73 * by another host and the import fails. Since the granularity of the
74 * timestamp is in seconds this activity test must take a bare minimum of one
75 * second. In order to assure the accuracy of the activity test, the default
76 * values result in an activity test duration of 10x the mmp write interval.
78 * The "zpool import" activity test can be expected to take a minimum time of
79 * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds. If the
80 * "best" uberblock has a valid ub_mmp_delay field, then the duration of the
81 * test may take longer if MMP writes were occurring less frequently than
82 * expected. Additionally, the duration is then extended by a random 25% to
 * attempt to detect simultaneous imports.  For example, if both partner
 * hosts are rebooted at the same time and automatically attempt to import the
 * pool.
89 * Used to control the frequency of mmp writes which are performed when the
90 * 'multihost' pool property is on. This is one factor used to determine the
91 * length of the activity check during import.
93 * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
94 * This means that on average an mmp write will be issued for each leaf vdev
95 * every zfs_multihost_interval milliseconds. In practice, the observed period
96 * can vary with the I/O load and this observed value is the delay which is
97 * stored in the uberblock. The minimum allowed value is 100 ms.
99 ulong_t zfs_multihost_interval
= MMP_DEFAULT_INTERVAL
;
102 * Used to control the duration of the activity test on import. Smaller values
103 * of zfs_multihost_import_intervals will reduce the import time but increase
104 * the risk of failing to detect an active pool. The total activity check time
105 * is never allowed to drop below one second. A value of 0 is ignored and
106 * treated as if it was set to 1.
108 uint_t zfs_multihost_import_intervals
= MMP_DEFAULT_IMPORT_INTERVALS
;
111 * Controls the behavior of the pool when mmp write failures are detected.
113 * When zfs_multihost_fail_intervals = 0 then mmp write failures are ignored.
114 * The failures will still be reported to the ZED which depending on its
115 * configuration may take action such as suspending the pool or taking a
118 * When zfs_multihost_fail_intervals > 0 then sequential mmp write failures will
119 * cause the pool to be suspended. This occurs when
120 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
121 * passed since the last successful mmp write. This guarantees the activity
122 * test will see mmp writes if the
125 uint_t zfs_multihost_fail_intervals
= MMP_DEFAULT_FAIL_INTERVALS
;
/* Tag used for the SCL_STATE config-lock hold taken around each MMP write. */
char *mmp_tag = "mmp_write_uberblock";

static void mmp_thread(void *arg);
133 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
135 mutex_init(&mmp
->mmp_thread_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
136 cv_init(&mmp
->mmp_thread_cv
, NULL
, CV_DEFAULT
, NULL
);
137 mutex_init(&mmp
->mmp_io_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
143 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
145 mutex_destroy(&mmp
->mmp_thread_lock
);
146 cv_destroy(&mmp
->mmp_thread_cv
);
147 mutex_destroy(&mmp
->mmp_io_lock
);
151 mmp_thread_enter(mmp_thread_t
*mmp
, callb_cpr_t
*cpr
)
153 CALLB_CPR_INIT(cpr
, &mmp
->mmp_thread_lock
, callb_generic_cpr
, FTAG
);
154 mutex_enter(&mmp
->mmp_thread_lock
);
158 mmp_thread_exit(mmp_thread_t
*mmp
, kthread_t
**mpp
, callb_cpr_t
*cpr
)
160 ASSERT(*mpp
!= NULL
);
162 cv_broadcast(&mmp
->mmp_thread_cv
);
163 CALLB_CPR_EXIT(cpr
); /* drops &mmp->mmp_thread_lock */
168 mmp_thread_start(spa_t
*spa
)
170 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
172 if (spa_writeable(spa
)) {
173 mutex_enter(&mmp
->mmp_thread_lock
);
174 if (!mmp
->mmp_thread
) {
175 dprintf("mmp_thread_start pool %s\n",
177 mmp
->mmp_thread
= thread_create(NULL
, 0, mmp_thread
,
178 spa
, 0, &p0
, TS_RUN
, defclsyspri
);
180 mutex_exit(&mmp
->mmp_thread_lock
);
185 mmp_thread_stop(spa_t
*spa
)
187 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
189 mutex_enter(&mmp
->mmp_thread_lock
);
190 mmp
->mmp_thread_exiting
= 1;
191 cv_broadcast(&mmp
->mmp_thread_cv
);
193 while (mmp
->mmp_thread
) {
194 cv_wait(&mmp
->mmp_thread_cv
, &mmp
->mmp_thread_lock
);
196 mutex_exit(&mmp
->mmp_thread_lock
);
198 ASSERT(mmp
->mmp_thread
== NULL
);
199 mmp
->mmp_thread_exiting
= 0;
203 * Choose a leaf vdev to write an MMP block to. It must not have an
204 * outstanding mmp write (if so then there is a problem, and a new write will
205 * also block). If there is no usable leaf in this subtree return NULL,
206 * otherwise return a pointer to the leaf.
208 * When walking the subtree, a random child is chosen as the starting point so
209 * that when the tree is healthy, the leaf chosen will be random with even
210 * distribution. If there are unhealthy vdevs in the tree, the distribution
211 * will be really poor only if a large proportion of the vdevs are unhealthy,
212 * in which case there are other more pressing problems.
215 mmp_random_leaf(vdev_t
*vd
)
219 if (!vdev_writeable(vd
))
222 if (vd
->vdev_ops
->vdev_op_leaf
)
223 return (vd
->vdev_mmp_pending
== 0 ? vd
: NULL
);
225 child_idx
= spa_get_random(vd
->vdev_children
);
226 for (int offset
= vd
->vdev_children
; offset
> 0; offset
--) {
228 vdev_t
*child
= vd
->vdev_child
[(child_idx
+ offset
) %
231 leaf
= mmp_random_leaf(child
);
240 mmp_write_done(zio_t
*zio
)
242 spa_t
*spa
= zio
->io_spa
;
243 vdev_t
*vd
= zio
->io_vd
;
244 mmp_thread_t
*mts
= zio
->io_private
;
246 mutex_enter(&mts
->mmp_io_lock
);
247 vd
->vdev_mmp_pending
= 0;
253 * Mmp writes are queued on a fixed schedule, but under many
254 * circumstances, such as a busy device or faulty hardware,
255 * the writes will complete at variable, much longer,
256 * intervals. In these cases, another node checking for
257 * activity must wait longer to account for these delays.
259 * The mmp_delay is calculated as a decaying average of the interval
260 * between completed mmp writes. This is used to predict how long
261 * the import must wait to detect activity in the pool, before
262 * concluding it is not in use.
264 * Do not set mmp_delay if the multihost property is not on,
265 * so as not to trigger an activity check on import.
267 if (spa_multihost(spa
)) {
268 hrtime_t delay
= gethrtime() - mts
->mmp_last_write
;
270 if (delay
> mts
->mmp_delay
)
271 mts
->mmp_delay
= delay
;
273 mts
->mmp_delay
= (delay
+ mts
->mmp_delay
* 127) /
278 mts
->mmp_last_write
= gethrtime();
281 mutex_exit(&mts
->mmp_io_lock
);
282 spa_config_exit(spa
, SCL_STATE
, mmp_tag
);
284 abd_free(zio
->io_abd
);
288 * When the uberblock on-disk is updated by a spa_sync,
289 * creating a new "best" uberblock, update the one stored
290 * in the mmp thread state, used for mmp writes.
293 mmp_update_uberblock(spa_t
*spa
, uberblock_t
*ub
)
295 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
297 mutex_enter(&mmp
->mmp_io_lock
);
299 mmp
->mmp_ub
.ub_timestamp
= gethrestime_sec();
300 mutex_exit(&mmp
->mmp_io_lock
);
304 * Choose a random vdev, label, and MMP block, and write over it
305 * with a copy of the last-synced uberblock, whose timestamp
306 * has been updated to reflect that the pool is in use.
309 mmp_write_uberblock(spa_t
*spa
)
311 int flags
= ZIO_FLAG_CONFIG_WRITER
| ZIO_FLAG_CANFAIL
;
312 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
318 spa_config_enter(spa
, SCL_STATE
, mmp_tag
, RW_READER
);
319 vd
= mmp_random_leaf(spa
->spa_root_vdev
);
321 spa_config_exit(spa
, SCL_STATE
, FTAG
);
325 mutex_enter(&mmp
->mmp_io_lock
);
327 if (mmp
->mmp_zio_root
== NULL
)
328 mmp
->mmp_zio_root
= zio_root(spa
, NULL
, NULL
,
329 flags
| ZIO_FLAG_GODFATHER
);
332 ub
->ub_timestamp
= gethrestime_sec();
333 ub
->ub_mmp_magic
= MMP_MAGIC
;
334 ub
->ub_mmp_delay
= mmp
->mmp_delay
;
335 vd
->vdev_mmp_pending
= gethrtime();
337 zio_t
*zio
= zio_null(mmp
->mmp_zio_root
, spa
, NULL
, NULL
, NULL
, flags
);
338 abd_t
*ub_abd
= abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd
), B_TRUE
);
339 abd_zero(ub_abd
, VDEV_UBERBLOCK_SIZE(vd
));
340 abd_copy_from_buf(ub_abd
, ub
, sizeof (uberblock_t
));
342 mutex_exit(&mmp
->mmp_io_lock
);
344 offset
= VDEV_UBERBLOCK_OFFSET(vd
, VDEV_UBERBLOCK_COUNT(vd
) -
345 MMP_BLOCKS_PER_LABEL
+ spa_get_random(MMP_BLOCKS_PER_LABEL
));
347 label
= spa_get_random(VDEV_LABELS
);
348 vdev_label_write(zio
, vd
, label
, ub_abd
, offset
,
349 VDEV_UBERBLOCK_SIZE(vd
), mmp_write_done
, mmp
,
350 flags
| ZIO_FLAG_DONT_PROPAGATE
);
352 spa_mmp_history_add(ub
->ub_txg
, ub
->ub_timestamp
, ub
->ub_mmp_delay
, vd
,
359 mmp_thread(void *arg
)
361 spa_t
*spa
= (spa_t
*)arg
;
362 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
363 boolean_t last_spa_suspended
= spa_suspended(spa
);
364 boolean_t last_spa_multihost
= spa_multihost(spa
);
366 hrtime_t max_fail_ns
= zfs_multihost_fail_intervals
*
367 MSEC2NSEC(MAX(zfs_multihost_interval
, MMP_MIN_INTERVAL
));
369 mmp_thread_enter(mmp
, &cpr
);
372 * The mmp_write_done() function calculates mmp_delay based on the
373 * prior value of mmp_delay and the elapsed time since the last write.
374 * For the first mmp write, there is no "last write", so we start
375 * with fake, but reasonable, default non-zero values.
377 mmp
->mmp_delay
= MSEC2NSEC(MAX(zfs_multihost_interval
,
378 MMP_MIN_INTERVAL
)) / MAX(vdev_count_leaves(spa
), 1);
379 mmp
->mmp_last_write
= gethrtime() - mmp
->mmp_delay
;
381 while (!mmp
->mmp_thread_exiting
) {
382 uint64_t mmp_fail_intervals
= zfs_multihost_fail_intervals
;
383 uint64_t mmp_interval
= MSEC2NSEC(
384 MAX(zfs_multihost_interval
, MMP_MIN_INTERVAL
));
385 boolean_t suspended
= spa_suspended(spa
);
386 boolean_t multihost
= spa_multihost(spa
);
387 hrtime_t start
, next_time
;
391 next_time
= start
+ mmp_interval
/
392 MAX(vdev_count_leaves(spa
), 1);
394 next_time
= start
+ MSEC2NSEC(MMP_DEFAULT_INTERVAL
);
398 * When MMP goes off => on, or spa goes suspended =>
399 * !suspended, we know no writes occurred recently. We
400 * update mmp_last_write to give us some time to try.
402 if ((!last_spa_multihost
&& multihost
) ||
403 (last_spa_suspended
&& !suspended
)) {
404 mutex_enter(&mmp
->mmp_io_lock
);
405 mmp
->mmp_last_write
= gethrtime();
406 mutex_exit(&mmp
->mmp_io_lock
);
407 } else if (last_spa_multihost
&& !multihost
) {
408 mutex_enter(&mmp
->mmp_io_lock
);
410 mutex_exit(&mmp
->mmp_io_lock
);
412 last_spa_multihost
= multihost
;
413 last_spa_suspended
= suspended
;
416 * Smooth max_fail_ns when its factors are decreased, because
417 * making (max_fail_ns < mmp_interval) results in the pool being
418 * immediately suspended before writes can occur at the new
421 if ((mmp_interval
* mmp_fail_intervals
) < max_fail_ns
) {
422 max_fail_ns
= ((31 * max_fail_ns
) + (mmp_interval
*
423 mmp_fail_intervals
)) / 32;
425 max_fail_ns
= mmp_interval
* mmp_fail_intervals
;
429 * Suspend the pool if no MMP write has succeeded in over
430 * mmp_interval * mmp_fail_intervals nanoseconds.
432 if (!suspended
&& mmp_fail_intervals
&& multihost
&&
433 (start
- mmp
->mmp_last_write
) > max_fail_ns
) {
434 cmn_err(CE_WARN
, "MMP writes to pool '%s' have not "
435 "succeeded in over %llus; suspending pool",
437 NSEC2SEC(start
- mmp
->mmp_last_write
));
438 zio_suspend(spa
, NULL
);
442 mmp_write_uberblock(spa
);
444 CALLB_CPR_SAFE_BEGIN(&cpr
);
445 (void) cv_timedwait_sig(&mmp
->mmp_thread_cv
,
446 &mmp
->mmp_thread_lock
, ddi_get_lbolt() +
447 ((next_time
- gethrtime()) / (NANOSEC
/ hz
)));
448 CALLB_CPR_SAFE_END(&cpr
, &mmp
->mmp_thread_lock
);
451 /* Outstanding writes are allowed to complete. */
452 if (mmp
->mmp_zio_root
)
453 zio_wait(mmp
->mmp_zio_root
);
455 mmp
->mmp_zio_root
= NULL
;
456 mmp_thread_exit(mmp
, &mmp
->mmp_thread
, &cpr
);
460 * Signal the MMP thread to wake it, when it is sleeping on
461 * its cv. Used when some module parameter has changed and
462 * we want the thread to know about it.
463 * Only signal if the pool is active and mmp thread is
464 * running, otherwise there is no thread to wake.
467 mmp_signal_thread(spa_t
*spa
)
469 mmp_thread_t
*mmp
= &spa
->spa_mmp
;
471 mutex_enter(&mmp
->mmp_thread_lock
);
473 cv_broadcast(&mmp
->mmp_thread_cv
);
474 mutex_exit(&mmp
->mmp_thread_lock
);
478 mmp_signal_all_threads(void)
482 mutex_enter(&spa_namespace_lock
);
483 while ((spa
= spa_next(spa
))) {
484 if (spa
->spa_state
== POOL_STATE_ACTIVE
)
485 mmp_signal_thread(spa
);
487 mutex_exit(&spa_namespace_lock
);
490 #if defined(_KERNEL) && defined(HAVE_SPL)
491 #include <linux/mod_compat.h>
494 param_set_multihost_interval(const char *val
, zfs_kernel_param_t
*kp
)
498 ret
= param_set_ulong(val
, kp
);
502 mmp_signal_all_threads();
508 module_param(zfs_multihost_fail_intervals
, uint
, 0644);
509 MODULE_PARM_DESC(zfs_multihost_fail_intervals
,
510 "Max allowed period without a successful mmp write");
512 module_param_call(zfs_multihost_interval
, param_set_multihost_interval
,
513 param_get_ulong
, &zfs_multihost_interval
, 0644);
514 MODULE_PARM_DESC(zfs_multihost_interval
,
515 "Milliseconds between mmp writes to each leaf");
517 module_param(zfs_multihost_import_intervals
, uint
, 0644);
518 MODULE_PARM_DESC(zfs_multihost_import_intervals
,
519 "Number of zfs_multihost_interval periods to wait for activity");