/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
 */

#include <sys/abd.h>
#include <sys/mmp.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_context.h>
#include <sys/callb.h>

/*
 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
 * or opening a pool on more than one host at a time. In particular, it
 * prevents "zpool import -f" on a host from succeeding while the pool is
 * already imported on another host. There are many other ways in which a
 * device could be used by two hosts for different purposes at the same time
 * resulting in pool damage. This implementation does not attempt to detect
 * those cases.
 *
 * MMP operates by ensuring there are frequent visible changes on disk (a
 * "heartbeat") at all times, and by altering the import process to check
 * for these changes and fail the import when they are detected. This
 * functionality is enabled by setting the 'multihost' pool property to on.
 *
 * Uberblocks written by the txg_sync thread always go into the first
 * (N-MMP_BLOCKS_PER_LABEL) slots; the remaining slots are reserved for MMP.
 * They are used to hold uberblocks which are exactly the same as the last
 * synced uberblock except that the ub_timestamp is frequently updated.
 * Like all other uberblocks, the slot is written with an embedded checksum,
 * and slots with invalid checksums are ignored. This provides the
 * "heartbeat", with no risk of overwriting good uberblocks that must be
 * preserved, e.g. previous txgs and associated block pointers.
 *
 * Two optional fields are added to the uberblock structure: ub_mmp_magic and
 * ub_mmp_delay. The magic field allows zfs to tell whether ub_mmp_delay is
 * valid. The delay field is a decaying average of the amount of time between
 * completion of successive MMP writes, in nanoseconds. It is used to predict
 * how long the import must wait to detect activity in the pool, before
 * concluding it is not in use.
 *
 * During import an activity test may now be performed to determine if
 * the pool is in use. The activity test is typically required if the
 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
 * POOL_STATE_ACTIVE, and the pool is not a root pool.
 *
 * The activity test finds the "best" uberblock (highest txg & timestamp),
 * waits some time, and then finds the "best" uberblock again. If the txg
 * and timestamp in both "best" uberblocks do not match, the pool is in use
 * by another host and the import fails. Since the granularity of the
 * timestamp is in seconds this activity test must take a bare minimum of one
 * second. In order to assure the accuracy of the activity test, the default
 * values result in an activity test duration of 10x the mmp write interval.
 *
 * The "zpool import" activity test can be expected to take a minimum time of
 * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds. If
 * the "best" uberblock has a valid ub_mmp_delay field, then the duration of
 * the test may take longer if MMP writes were occurring less frequently than
 * expected. Additionally, the duration is then extended by a random 25% to
 * attempt to detect simultaneous imports, for example when both partner
 * hosts are rebooted at the same time and automatically attempt to import
 * the pool.
 */

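/*
 * As a rough worked example (the values are illustrative, not necessarily
 * the defaults): with zfs_multihost_interval = 1000 ms and
 * zfs_multihost_import_intervals = 10, an importing host must observe at
 * least 10 * 1000 ms = 10 seconds of inactivity, extended by up to a
 * random 25% (roughly 12.5 seconds), and longer still if ub_mmp_delay
 * indicates writes were completing more slowly than once per interval.
 */
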
/*
 * Used to control the frequency of mmp writes which are performed when the
 * 'multihost' pool property is on. This is one factor used to determine the
 * length of the activity check during import.
 *
 * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
 * This means that on average an mmp write will be issued for each leaf vdev
 * every zfs_multihost_interval milliseconds. In practice, the observed period
 * can vary with the I/O load, and this observed value is the delay stored
 * in the uberblock. The minimum allowed value is 100 ms.
 */
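/*
 * For example (illustrative values): with zfs_multihost_interval set to
 * 1000 ms on a pool with 8 leaf vdevs, an mmp write is issued to some leaf
 * roughly every 125 ms, so each individual leaf is written about once per
 * second.
 */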
ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;

/*
 * Used to control the duration of the activity test on import. Smaller values
 * of zfs_multihost_import_intervals will reduce the import time but increase
 * the risk of failing to detect an active pool. The total activity check time
 * is never allowed to drop below one second. A value of 0 is ignored and
 * treated as if it were set to 1.
 */
uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;

/*
 * Controls the behavior of the pool when mmp write failures are detected.
 *
 * When zfs_multihost_fail_intervals = 0, mmp write failures are ignored.
 * The failures will still be reported to the ZED which, depending on its
 * configuration, may take action such as suspending the pool or taking a
 * device offline.
 *
 * When zfs_multihost_fail_intervals > 0, sequential mmp write failures will
 * cause the pool to be suspended. This occurs when
 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
 * passed since the last successful mmp write. This guarantees the activity
 * test will see mmp writes if the pool is imported.
 */
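/*
 * For example (illustrative values): with zfs_multihost_fail_intervals = 5
 * and zfs_multihost_interval = 1000 ms, the pool is suspended if no mmp
 * write has succeeded in the last 5 * 1000 ms = 5 seconds.
 */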
uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;

static void mmp_thread(void *arg);

void
mmp_init(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
mmp_fini(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_destroy(&mmp->mmp_thread_lock);
	cv_destroy(&mmp->mmp_thread_cv);
	mutex_destroy(&mmp->mmp_io_lock);
}

static void
mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
	mutex_enter(&mmp->mmp_thread_lock);
}

static void
mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
{
	ASSERT(*mpp != NULL);
	*mpp = NULL;
	cv_broadcast(&mmp->mmp_thread_cv);
	CALLB_CPR_EXIT(cpr);	/* drops &mmp->mmp_thread_lock */
	thread_exit();
}

void
mmp_thread_start(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	if (spa_writeable(spa)) {
		mutex_enter(&mmp->mmp_thread_lock);
		if (!mmp->mmp_thread) {
			dprintf("mmp_thread_start pool %s\n",
			    spa->spa_name);
			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
			    spa, 0, &p0, TS_RUN, defclsyspri);
		}
		mutex_exit(&mmp->mmp_thread_lock);
	}
}

void
mmp_thread_stop(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	mmp->mmp_thread_exiting = 1;
	cv_broadcast(&mmp->mmp_thread_cv);

	while (mmp->mmp_thread) {
		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
	}
	mutex_exit(&mmp->mmp_thread_lock);

	ASSERT(mmp->mmp_thread == NULL);
	mmp->mmp_thread_exiting = 0;
}

/*
 * Choose a leaf vdev to write an MMP block to. It must not have an
 * outstanding mmp write (if so then there is a problem, and a new write will
 * also block). If there is no usable leaf in this subtree return NULL,
 * otherwise return a pointer to the leaf.
 *
 * When walking the subtree, a random child is chosen as the starting point so
 * that when the tree is healthy, the leaf chosen will be random with even
 * distribution. If there are unhealthy vdevs in the tree, the distribution
 * will be really poor only if a large proportion of the vdevs are unhealthy,
 * in which case there are other more pressing problems.
 */
static vdev_t *
mmp_random_leaf(vdev_t *vd)
{
	int child_idx;

	if (!vdev_writeable(vd))
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd->vdev_mmp_pending == 0 ? vd : NULL);

	child_idx = spa_get_random(vd->vdev_children);
	for (int offset = vd->vdev_children; offset > 0; offset--) {
		vdev_t *leaf;
		vdev_t *child = vd->vdev_child[(child_idx + offset) %
		    vd->vdev_children];

		leaf = mmp_random_leaf(child);
		if (leaf)
			return (leaf);
	}

	return (NULL);
}

static void
mmp_write_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	mmp_thread_t *mts = zio->io_private;

	mutex_enter(&mts->mmp_io_lock);
	vd->vdev_mmp_pending = 0;

	if (zio->io_error)
		goto unlock;

	/*
	 * Mmp writes are queued on a fixed schedule, but under many
	 * circumstances, such as a busy device or faulty hardware,
	 * the writes will complete at variable, much longer,
	 * intervals. In these cases, another node checking for
	 * activity must wait longer to account for these delays.
	 *
	 * The mmp_delay is calculated as a decaying average of the interval
	 * between completed mmp writes. This is used to predict how long
	 * the import must wait to detect activity in the pool, before
	 * concluding it is not in use.
	 *
	 * Do not set mmp_delay if the multihost property is not on,
	 * so as not to trigger an activity check on import.
	 */
	if (spa_multihost(spa)) {
		hrtime_t delay = gethrtime() - mts->mmp_last_write;

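		/*
		 * Increases are taken immediately; otherwise decay toward
		 * the new sample, weighting the previous average 127/128.
		 */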
		if (delay > mts->mmp_delay)
			mts->mmp_delay = delay;
		else
			mts->mmp_delay = (delay + mts->mmp_delay * 127) /
			    128;
	} else {
		mts->mmp_delay = 0;
	}
	mts->mmp_last_write = gethrtime();

unlock:
	mutex_exit(&mts->mmp_io_lock);
	spa_config_exit(spa, SCL_STATE, FTAG);

	abd_free(zio->io_abd);
}

/*
 * When the uberblock on-disk is updated by a spa_sync,
 * creating a new "best" uberblock, update the one stored
 * in the mmp thread state, used for mmp writes.
 */
void
mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_io_lock);
	mmp->mmp_ub = *ub;
	mmp->mmp_ub.ub_timestamp = gethrestime_sec();
	mutex_exit(&mmp->mmp_io_lock);
}

/*
 * Choose a random vdev, label, and MMP block, and write over it
 * with a copy of the last-synced uberblock, whose timestamp
 * has been updated to reflect that the pool is in use.
 */
static void
mmp_write_uberblock(spa_t *spa)
{
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	mmp_thread_t *mmp = &spa->spa_mmp;
	uberblock_t *ub;
	vdev_t *vd;
	int label;
	uint64_t offset;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vd = mmp_random_leaf(spa->spa_root_vdev);
	if (vd == NULL) {
		spa_config_exit(spa, SCL_STATE, FTAG);
		return;
	}

	mutex_enter(&mmp->mmp_io_lock);

	if (mmp->mmp_zio_root == NULL)
		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
		    flags | ZIO_FLAG_GODFATHER);

	ub = &mmp->mmp_ub;
	ub->ub_timestamp = gethrestime_sec();
	ub->ub_mmp_magic = MMP_MAGIC;
	ub->ub_mmp_delay = mmp->mmp_delay;
	vd->vdev_mmp_pending = gethrtime();

	zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

	mutex_exit(&mmp->mmp_io_lock);

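	/*
	 * Write to a random one of the MMP_BLOCKS_PER_LABEL slots reserved
	 * at the end of the uberblock ring, so uberblocks written by
	 * txg_sync are never overwritten.
	 */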
	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
	    MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));

	label = spa_get_random(VDEV_LABELS);
	vdev_label_write(zio, vd, label, ub_abd, offset,
	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
	    flags | ZIO_FLAG_DONT_PROPAGATE);

	spa_mmp_history_add(ub->ub_txg, ub->ub_timestamp, ub->ub_mmp_delay, vd,
	    label);

	zio_nowait(zio);
}

static void
mmp_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t last_spa_suspended = spa_suspended(spa);
	boolean_t last_spa_multihost = spa_multihost(spa);
	callb_cpr_t cpr;
	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));

	mmp_thread_enter(mmp, &cpr);

	/*
	 * The mmp_write_done() function calculates mmp_delay based on the
	 * prior value of mmp_delay and the elapsed time since the last write.
	 * For the first mmp write, there is no "last write", so we start
	 * with a reasonable, non-zero default.
	 */
	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;

	while (!mmp->mmp_thread_exiting) {
		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
		uint64_t mmp_interval = MSEC2NSEC(
		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
		boolean_t suspended = spa_suspended(spa);
		boolean_t multihost = spa_multihost(spa);
		hrtime_t start, next_time;

		start = gethrtime();
		if (multihost) {
			next_time = start + mmp_interval /
			    MAX(vdev_count_leaves(spa), 1);
		} else {
			next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
		}

		/*
		 * When MMP goes off => on, or the spa goes suspended =>
		 * !suspended, we know no writes occurred recently. We
		 * update mmp_last_write to give us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mutex_exit(&mmp->mmp_io_lock);
		} else if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}
		last_spa_multihost = multihost;
		last_spa_suspended = suspended;

		/*
		 * Smooth max_fail_ns when its factors are decreased, because
		 * making (max_fail_ns < mmp_interval) would suspend the pool
		 * immediately, before writes can occur at the new, higher
		 * frequency.
		 */
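		/*
		 * Each pass moves max_fail_ns 1/32 of the way toward the
		 * new, smaller mmp_interval * mmp_fail_intervals value.
		 */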
		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
			max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
			    mmp_fail_intervals)) / 32;
		} else {
			max_fail_ns = mmp_interval * mmp_fail_intervals;
		}

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (!suspended && mmp_fail_intervals && multihost &&
		    (start - mmp->mmp_last_write) > max_fail_ns) {
			zio_suspend(spa, NULL);
		}

		if (multihost)
			mmp_write_uberblock(spa);

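		/*
		 * Sleep until the next scheduled write, converting the
		 * remaining nanoseconds into clock ticks for cv_timedwait.
		 */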
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, ddi_get_lbolt() +
		    ((next_time - gethrtime()) / (NANOSEC / hz)));
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	if (mmp->mmp_zio_root)
		zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}

/*
 * Signal the MMP thread to wake it when it is sleeping on
 * its cv. Used when some module parameter has changed and
 * we want the thread to know about it.
 * Only signal if the pool is active and the mmp thread is
 * running; otherwise there is no thread to wake.
 */
static void
mmp_signal_thread(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	if (mmp->mmp_thread)
		cv_broadcast(&mmp->mmp_thread_cv);
	mutex_exit(&mmp->mmp_thread_lock);
}

void
mmp_signal_all_threads(void)
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa))) {
		if (spa->spa_state == POOL_STATE_ACTIVE)
			mmp_signal_thread(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
#include <linux/mod_compat.h>

static int
param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
{
	int ret;

	ret = param_set_ulong(val, kp);
	if (ret < 0)
		return (ret);

	mmp_signal_all_threads();

	return (ret);
}

/* BEGIN CSTYLED */
module_param(zfs_multihost_fail_intervals, uint, 0644);
MODULE_PARM_DESC(zfs_multihost_fail_intervals,
	"Max allowed period without a successful mmp write");

module_param_call(zfs_multihost_interval, param_set_multihost_interval,
	param_get_ulong, &zfs_multihost_interval, 0644);
MODULE_PARM_DESC(zfs_multihost_interval,
	"Milliseconds between mmp writes to each leaf");

module_param(zfs_multihost_import_intervals, uint, 0644);
MODULE_PARM_DESC(zfs_multihost_import_intervals,
	"Number of zfs_multihost_interval periods to wait for activity");
/* END CSTYLED */
#endif