/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2024, Klara Inc.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault. These are kept in a global list. Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field. If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

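/*
 * An illustrative (hypothetical) record for the data-fault case: to
 * return EIO for reads of blocks 0-63 of object 5 in objset 21, a
 * caller would fill in roughly:
 *
 *	zinject_record_t zr = { 0 };
 *	zr.zi_cmd = ZINJECT_DATA_FAULT;
 *	zr.zi_objset = 21;
 *	zr.zi_object = 5;
 *	zr.zi_start = 0;
 *	zr.zi_end = 63;
 *	zr.zi_error = EIO;
 *
 * and hand it to zio_inject_fault() below. In practice these records
 * are built by the zinject(8) tool and arrive via the ZFS ioctl
 * interface.
 */
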
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;

/*
 * Data describing each zinject handler registered on the system,
 * including the list node linking the handler into the global zinject
 * handler list.
 */
typedef struct inject_handler {
        int                     zi_id;
        spa_t                   *zi_spa;
        char                    *zi_spa_name;   /* ZINJECT_DELAY_IMPORT only */
        zinject_record_t        zi_record;
        uint64_t                *zi_lanes;
        int                     zi_next_lane;
        list_node_t             zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above, as well as the inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock should be
 * taken as RW_WRITER; any time traversal is done over the list
 * (without modification to it) this lock should be taken as RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must be made while holding
 * the inject_lock as RW_WRITER, and reads of this count must hold it
 * (at least) as RW_READER.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(); refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
        /*
         * zero implies always (100%)
         */
        if (frequency == 0)
                return (B_TRUE);

        /*
         * Note: we still handle legacy (unscaled) frequency values
         */
        uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

        return (random_in_range(maximum) < frequency);
}

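/*
 * For example, a legacy value of frequency == 25 is compared against a
 * maximum of 100 and so triggers on roughly 25% of calls, while a
 * scaled value of ZI_PERCENTAGE_MAX / 4 (which is > 100) is compared
 * against ZI_PERCENTAGE_MAX and likewise triggers on roughly 25% of
 * calls.
 */
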
/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
        /*
         * Check for a match against the MOS, which is based on type
         */
        if (zb->zb_objset == DMU_META_OBJSET &&
            record->zi_objset == DMU_META_OBJSET &&
            record->zi_object == DMU_META_DNODE_OBJECT) {
                if (record->zi_type == DMU_OT_NONE ||
                    type == record->zi_type)
                        return (freq_triggered(record->zi_freq));
                else
                        return (B_FALSE);
        }

        /*
         * Check for an exact match.
         */
        if (zb->zb_objset == record->zi_objset &&
            zb->zb_object == record->zi_object &&
            zb->zb_level == record->zi_level &&
            zb->zb_blkid >= record->zi_start &&
            zb->zb_blkid <= record->zi_end &&
            (record->zi_dvas == 0 ||
            (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) &&
            error == record->zi_error) {
                return (freq_triggered(record->zi_freq));
        }

        return (B_FALSE);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa)
                        continue;

                if (handler->zi_record.zi_type == type &&
                    strcmp(tag, handler->zi_record.zi_func) == 0)
                        panic("Panic requested in function %s\n", tag);
        }

        rw_exit(&inject_lock);
}

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
        int ret = 0;
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
                        continue;

                if (zio_match_handler(zb, type, ZI_NO_DVA,
                    &handler->zi_record, error)) {
                        ret = error;
                        break;
                }
        }

        rw_exit(&inject_lock);
        return (ret);
}

/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
        int i = ZI_NO_DVA;

        if (zio->io_bp != NULL && zio->io_vd != NULL &&
            zio->io_child_type == ZIO_CHILD_VDEV) {
                for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
                        dva_t *dva = &zio->io_bp->blk_dva[i];
                        uint64_t off = DVA_GET_OFFSET(dva);
                        vdev_t *vd = vdev_lookup_top(zio->io_spa,
                            DVA_GET_VDEV(dva));

                        /* Compensate for vdev label added to leaves */
                        if (zio->io_vd->vdev_ops->vdev_op_leaf)
                                off += VDEV_LABEL_START_SIZE;

                        if (zio->io_vd == vd && zio->io_offset == off)
                                break;
                }
        }

        return (i);
}

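/*
 * For example, a child read of a 2-way mirror block has two candidate
 * DVAs; the loop above matches the child's vdev and offset against each
 * DVA, adding VDEV_LABEL_START_SIZE for leaf vdevs because the DVA
 * offset is relative to the start of the allocatable space, which
 * begins after the two front labels and the boot region.
 */
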
/*
 * Determine if the I/O in question should return failure. Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
        int ret = 0;
        inject_handler_t *handler;

        /*
         * Ignore I/O not associated with any logical data.
         */
        if (zio->io_logical == NULL)
                return (0);

        /*
         * Currently, we only support fault injection on reads.
         */
        if (zio->io_type != ZIO_TYPE_READ)
                return (0);

        /*
         * A rebuild I/O has no checksum to verify.
         */
        if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
                return (0);

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
                if (zio->io_spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
                        continue;

                /* If this handler matches, return the specified error */
                if (zio_match_handler(&zio->io_logical->io_bookmark,
                    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
                    zio_match_dva(zio), &handler->zi_record, error)) {
                        ret = error;
                        break;
                }
        }

        rw_exit(&inject_lock);

        return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
        inject_handler_t *handler;
        vdev_t *vd = zio->io_vd;
        uint64_t offset = zio->io_offset;
        int label;
        int ret = 0;

        if (offset >= VDEV_LABEL_START_SIZE &&
            offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
                return (0);

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
                uint64_t start = handler->zi_record.zi_start;
                uint64_t end = handler->zi_record.zi_end;

                if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
                        continue;

                /*
                 * The injection region is the relative offsets within a
                 * vdev label. We must determine the label which is being
                 * updated and adjust our region accordingly.
                 */
                label = vdev_label_number(vd->vdev_psize, offset);
                start = vdev_label_offset(vd->vdev_psize, label, start);
                end = vdev_label_offset(vd->vdev_psize, label, end);

                if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
                    (offset >= start && offset <= end)) {
                        ret = error;
                        break;
                }
        }
        rw_exit(&inject_lock);
        return (ret);
}

static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
        zio_t *zio = private;
        uint8_t *buffer = data;
        uint_t byte = random_in_range(len);

        ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

        /* flip a single random bit in an abd data buffer */
        buffer[byte] ^= 1 << random_in_range(8);

        return (1);     /* stop after first flip */
}

static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
        inject_handler_t *handler;
        int ret = 0;

        /*
         * We skip over faults in the labels unless it's during device open
         * (i.e. zio == NULL) or a device flush (offset is meaningless).
         */
        if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) {
                uint64_t offset = zio->io_offset;

                if (offset < VDEV_LABEL_START_SIZE ||
                    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
                        return (0);
        }

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
                        continue;

                if (vd->vdev_guid == handler->zi_record.zi_guid) {
                        if (handler->zi_record.zi_failfast &&
                            (zio == NULL || (zio->io_flags &
                            (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
                                continue;
                        }

                        /* Handle type specific I/O failures */
                        if (zio != NULL &&
                            handler->zi_record.zi_iotype != ZIO_TYPES &&
                            handler->zi_record.zi_iotype != zio->io_type)
                                continue;

                        if (handler->zi_record.zi_error == err1 ||
                            handler->zi_record.zi_error == err2) {
                                /*
                                 * limit error injection if requested
                                 */
                                if (!freq_triggered(handler->zi_record.zi_freq))
                                        continue;

                                /*
                                 * For a failed open, pretend like the device
                                 * has gone away.
                                 */
                                if (err1 == ENXIO)
                                        vd->vdev_stat.vs_aux =
                                            VDEV_AUX_OPEN_FAILED;

                                /*
                                 * Treat these errors as if they had been
                                 * retried so that all the appropriate stats
                                 * and FMA events are generated.
                                 */
                                if (!handler->zi_record.zi_failfast &&
                                    zio != NULL)
                                        zio->io_flags |= ZIO_FLAG_IO_RETRY;

                                /*
                                 * EILSEQ means flip a bit after a read
                                 */
                                if (handler->zi_record.zi_error == EILSEQ) {
                                        if (zio == NULL)
                                                break;

                                        /* locate buffer data and flip a bit */
                                        (void) abd_iterate_func(zio->io_abd, 0,
                                            zio->io_size, zio_inject_bitflip_cb,
                                            zio);
                                        break;
                                }

                                ret = handler->zi_record.zi_error;
                                break;
                        }
                        if (handler->zi_record.zi_error == ENXIO) {
                                ret = SET_ERROR(EIO);
                                break;
                        }
                }
        }

        rw_exit(&inject_lock);

        return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
        return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
        return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

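/*
 * These device-level paths are normally exercised from userland via
 * zinject(8)'s -d (device) and -e (error type) options; of the errnos
 * handled above, EILSEQ is the special case that silently flips a bit
 * in the read buffer rather than failing the I/O with an errno.
 */
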
/*
 * Simulate hardware that ignores cache flushes. For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                /* Ignore errors not destined for this pool */
                if (zio->io_spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;

                /*
                 * Positive duration implies # of seconds, negative
                 * a number of txgs
                 */
                if (handler->zi_record.zi_timer == 0) {
                        if (handler->zi_record.zi_duration > 0)
                                handler->zi_record.zi_timer = ddi_get_lbolt64();
                        else
                                handler->zi_record.zi_timer = zio->io_txg;
                }

                /* Have a "problem" writing 60% of the time */
                if (random_in_range(100) < 60)
                        zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
                break;
        }

        rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
        inject_handler_t *handler;

        if (zio_injection_enabled == 0)
                return;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;

                if (handler->zi_record.zi_duration > 0) {
                        VERIFY(handler->zi_record.zi_timer == 0 ||
                            ddi_time_after64(
                            (int64_t)handler->zi_record.zi_timer +
                            handler->zi_record.zi_duration * hz,
                            ddi_get_lbolt64()));
                } else {
                        /* duration is negative so the subtraction here adds */
                        VERIFY(handler->zi_record.zi_timer == 0 ||
                            handler->zi_record.zi_timer -
                            handler->zi_record.zi_duration >=
                            spa_syncing_txg(spa));
                }
        }

        rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        inject_handler_t *min_handler = NULL;
        hrtime_t min_target = 0;

        rw_enter(&inject_lock, RW_READER);

        /*
         * inject_delay_count is a subset of zio_injection_enabled that
         * is only incremented for delay handlers. These checks are
         * mainly added to remind the reader why we're not explicitly
         * checking zio_injection_enabled like the other functions.
         */
        IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
        IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

        /*
         * If there aren't any inject delay handlers registered, then we
         * can short circuit and simply return 0 here. A value of zero
         * informs zio_delay_interrupt() that this request should not be
         * delayed. This short circuit keeps us from acquiring the
         * inject_delay_mtx unnecessarily.
         */
        if (inject_delay_count == 0) {
                rw_exit(&inject_lock);
                return (0);
        }

        /*
         * Each inject handler has a number of "lanes" associated with
         * it. Each lane is able to handle requests independently of one
         * another, and at a latency defined by the inject handler
         * record's zi_timer field. Thus if a handler is configured with
         * a single lane with a 10ms latency, it will delay requests
         * such that only a single request is completed every 10ms. So,
         * if more than one request is attempted in each 10ms interval,
         * the average latency of the requests will be greater than
         * 10ms; but if only a single request is submitted each 10ms
         * interval the average latency will be 10ms.
         *
         * We need to acquire this mutex to prevent multiple concurrent
         * threads being assigned to the same lane of a given inject
         * handler. The mutex allows us to perform the following two
         * operations atomically:
         *
         *      1. determine the minimum handler and minimum target
         *         value of all the possible handlers
         *      2. update that minimum handler's lane array
         *
         * Without atomicity, two (or more) threads could pick the same
         * lane in step (1), and then conflict with each other in step
         * (2). This could allow a single lane handler to process
         * multiple requests simultaneously, which shouldn't be possible.
         */
        mutex_enter(&inject_delay_mtx);

        for (inject_handler_t *handler = list_head(&inject_handlers);
            handler != NULL; handler = list_next(&inject_handlers, handler)) {
                if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
                        continue;

                if (!freq_triggered(handler->zi_record.zi_freq))
                        continue;

                if (vd->vdev_guid != handler->zi_record.zi_guid)
                        continue;

                /* also match on I/O type (e.g., -T read) */
                if (handler->zi_record.zi_iotype != ZIO_TYPES &&
                    handler->zi_record.zi_iotype != zio->io_type) {
                        continue;
                }

                /*
                 * Defensive; should never happen as the array allocation
                 * occurs prior to inserting this handler on the list.
                 */
                ASSERT3P(handler->zi_lanes, !=, NULL);

                /*
                 * This should never happen; the zinject command should
                 * prevent a user from setting an IO delay with zero lanes.
                 */
                ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

                ASSERT3U(handler->zi_record.zi_nlanes, >,
                    handler->zi_next_lane);

                /*
                 * We want to issue this IO to the lane that will become
                 * idle the soonest, so we compare the soonest this
                 * specific handler can complete the IO with all other
                 * handlers, to find the lowest value of all possible
                 * lanes. We then use this lane to submit the request.
                 *
                 * Since each handler has a constant value for its
                 * delay, we can just use the "next" lane for that
                 * handler; as it will always be the lane with the
                 * lowest value for that particular handler (i.e. the
                 * lane that will become idle the soonest). This saves a
                 * scan of each handler's lanes array.
                 *
                 * There are two cases to consider when determining when
                 * this specific IO request should complete. If this
                 * lane is idle, we want to "submit" the request now so
                 * it will complete after zi_timer milliseconds. Thus,
                 * we set the target to now + zi_timer.
                 *
                 * If the lane is busy, we want this request to complete
                 * zi_timer milliseconds after the lane becomes idle.
                 * Since the 'zi_lanes' array holds the time at which
                 * each lane will become idle, we use that value to
                 * determine when this request should complete.
                 */
                hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
                hrtime_t busy = handler->zi_record.zi_timer +
                    handler->zi_lanes[handler->zi_next_lane];
                hrtime_t target = MAX(idle, busy);

                if (min_handler == NULL) {
                        min_handler = handler;
                        min_target = target;
                        continue;
                }

                ASSERT3P(min_handler, !=, NULL);
                ASSERT3U(min_target, !=, 0);

                /*
                 * We don't yet increment the "next lane" variable since
                 * we still might find a lower value lane in another
                 * handler during any remaining iterations. Once we're
                 * sure we've selected the absolute minimum, we'll claim
                 * the lane and increment the handler's "next lane"
                 * field below.
                 */

                if (target < min_target) {
                        min_handler = handler;
                        min_target = target;
                }
        }

        /*
         * 'min_handler' will be NULL if no IO delays are registered for
         * this vdev, otherwise it will point to the handler containing
         * the lane that will become idle the soonest.
         */
        if (min_handler != NULL) {
                ASSERT3U(min_target, !=, 0);
                min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

                /*
                 * If we've used all possible lanes for this handler,
                 * loop back and start using the first lane again;
                 * otherwise, just increment the lane index.
                 */
                min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
                    min_handler->zi_record.zi_nlanes;
        }

        mutex_exit(&inject_delay_mtx);
        rw_exit(&inject_lock);

        return (min_target);
}

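/*
 * To make the lane bookkeeping above concrete: with one lane and
 * zi_timer == 10ms, three I/Os issued together at time t get targets of
 * t+10ms, t+20ms, and t+30ms, because each "busy" target builds on the
 * completion time recorded in zi_lanes[0] by the previous I/O. With two
 * lanes, the same three I/Os would complete at t+10ms, t+10ms, and
 * t+20ms.
 */
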
static void
zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
{
        inject_handler_t *handler;
        hrtime_t delay = 0;
        int id = 0;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers);
            handler != NULL && handler->zi_record.zi_cmd == command;
            handler = list_next(&inject_handlers, handler)) {
                ASSERT3P(handler->zi_spa_name, !=, NULL);
                if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
                        uint64_t pause =
                            SEC2NSEC(handler->zi_record.zi_duration);
                        if (pause > elapsed) {
                                delay = pause - elapsed;
                        }
                        id = handler->zi_id;
                        break;
                }
        }

        rw_exit(&inject_lock);

        if (delay) {
                if (command == ZINJECT_DELAY_IMPORT) {
                        spa_import_progress_set_notes(spa, "injecting %llu "
                            "sec delay", (u_longlong_t)NSEC2SEC(delay));
                }
                zfs_sleep_until(gethrtime() + delay);
        }
        if (id) {
                /* all done with this one-shot handler */
                zio_clear_fault(id);
        }
}

/*
 * For testing, inject a delay during an import
 */
void
zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
{
        zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
}

/*
 * For testing, inject a delay during an export
 */
void
zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
{
        zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}

static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        objset_t *os = NULL;
        dnode_t *dn = NULL;
        int error;

        /*
         * Obtain the dnode for the object, using the pool, objset, and
         * object IDs.
         */
        error = dsl_pool_hold(pool, FTAG, &dp);
        if (error)
                return (error);

        error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
        dsl_pool_rele(dp, FTAG);
        if (error)
                return (error);

        error = dmu_objset_from_ds(ds, &os);
        dsl_dataset_rele(ds, FTAG);
        if (error)
                return (error);

        error = dnode_hold(os, record->zi_object, FTAG, &dn);
        if (error)
                return (error);

        /*
         * Translate the range into block IDs
         */
        if (record->zi_start != 0 || record->zi_end != -1ULL) {
                record->zi_start >>= dn->dn_datablkshift;
                record->zi_end >>= dn->dn_datablkshift;
        }
        if (record->zi_level > 0) {
                if (record->zi_level >= dn->dn_nlevels) {
                        dnode_rele(dn, FTAG);
                        return (SET_ERROR(EDOM));
                }

                if (record->zi_start != 0 || record->zi_end != 0) {
                        int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

                        for (int level = record->zi_level; level > 0; level--) {
                                record->zi_start >>= shift;
                                record->zi_end >>= shift;
                        }
                }
        }

        dnode_rele(dn, FTAG);
        return (0);
}

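/*
 * A worked example of the translation above (sizes illustrative): with
 * a 128K data block (dn_datablkshift == 17), the byte range [0, 1M)
 * becomes block IDs 0 through 7. For zi_level > 0, each 128K indirect
 * block (dn_indblkshift == 17) holds 2^(17 - SPA_BLKPTRSHIFT) == 1024
 * block pointers, so every additional level shifts the IDs right by
 * another 10 bits.
 */
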
static boolean_t
zio_pool_handler_exists(const char *name, zinject_type_t command)
{
        boolean_t exists = B_FALSE;

        rw_enter(&inject_lock, RW_READER);
        for (inject_handler_t *handler = list_head(&inject_handlers);
            handler != NULL; handler = list_next(&inject_handlers, handler)) {
                if (command != handler->zi_record.zi_cmd)
                        continue;

                const char *pool = (handler->zi_spa_name != NULL) ?
                    handler->zi_spa_name : spa_name(handler->zi_spa);
                if (strcmp(name, pool) == 0) {
                        exists = B_TRUE;
                        break;
                }
        }
        rw_exit(&inject_lock);

        return (exists);
}

/*
 * Create a new handler for the given record. We add it to the list, adding
 * a reference to the spa_t in the process. We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
        inject_handler_t *handler;
        int error;
        spa_t *spa;

        /*
         * If this is pool-wide metadata, make sure we unload the corresponding
         * spa_t, so that the next attempt to load it will trigger the fault.
         * We call spa_reset() to unload the pool appropriately.
         */
        if (flags & ZINJECT_UNLOAD_SPA)
                if ((error = spa_reset(name)) != 0)
                        return (error);

        if (record->zi_cmd == ZINJECT_DELAY_IO) {
                /*
                 * A value of zero for the number of lanes or for the
                 * delay time doesn't make sense.
                 */
                if (record->zi_timer == 0 || record->zi_nlanes == 0)
                        return (SET_ERROR(EINVAL));

                /*
                 * The number of lanes is directly mapped to the size of
                 * an array used by the handler. Thus, to ensure the
                 * user doesn't trigger an allocation that's "too large"
                 * we cap the number of lanes here.
                 */
                if (record->zi_nlanes >= UINT16_MAX)
                        return (SET_ERROR(EINVAL));
        }

        /*
         * If the supplied range was in bytes -- calculate the actual blkid
         */
        if (flags & ZINJECT_CALC_RANGE) {
                error = zio_calculate_range(name, record);
                if (error != 0)
                        return (error);
        }

        if (!(flags & ZINJECT_NULL)) {
                /*
                 * Pool delays for import or export don't take an
                 * injection reference on the spa. Instead they
                 * rely on matching by name.
                 */
                if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
                    record->zi_cmd == ZINJECT_DELAY_EXPORT) {
                        if (record->zi_duration <= 0)
                                return (SET_ERROR(EINVAL));
                        /*
                         * Only one import | export delay handler per pool.
                         */
                        if (zio_pool_handler_exists(name, record->zi_cmd))
                                return (SET_ERROR(EEXIST));

                        mutex_enter(&spa_namespace_lock);
                        boolean_t has_spa = spa_lookup(name) != NULL;
                        mutex_exit(&spa_namespace_lock);

                        if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
                                return (SET_ERROR(EEXIST));
                        if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
                                return (SET_ERROR(ENOENT));
                        spa = NULL;
                } else {
                        /*
                         * spa_inject_addref() will add an injection
                         * reference, which will prevent the pool from being
                         * removed from the namespace while still allowing it
                         * to be unloaded.
                         */
                        if ((spa = spa_inject_addref(name)) == NULL)
                                return (SET_ERROR(ENOENT));
                }

                handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
                handler->zi_spa = spa;  /* note: can be NULL */
                handler->zi_record = *record;

                if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                        handler->zi_lanes = kmem_zalloc(
                            sizeof (*handler->zi_lanes) *
                            handler->zi_record.zi_nlanes, KM_SLEEP);
                        handler->zi_next_lane = 0;
                } else {
                        handler->zi_lanes = NULL;
                        handler->zi_next_lane = 0;
                }

                if (handler->zi_spa == NULL)
                        handler->zi_spa_name = spa_strdup(name);
                else
                        handler->zi_spa_name = NULL;

                rw_enter(&inject_lock, RW_WRITER);

                /*
                 * We can't move this increment into the conditional
                 * above because we need to hold the RW_WRITER lock of
                 * inject_lock, and we don't want to hold that while
                 * allocating the handler's zi_lanes array.
                 */
                if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                        ASSERT3S(inject_delay_count, >=, 0);
                        inject_delay_count++;
                        ASSERT3S(inject_delay_count, >, 0);
                }

                *id = handler->zi_id = inject_next_id++;
                list_insert_tail(&inject_handlers, handler);
                atomic_inc_32(&zio_injection_enabled);

                rw_exit(&inject_lock);
        }

        /*
         * Flush the ARC, so that any attempts to read this data will end up
         * going to the ZIO layer. Note that this is a little overkill, but
         * we don't have the necessary ARC interfaces to do anything else, and
         * fault injection isn't a performance critical path.
         */
        if (flags & ZINJECT_FLUSH_ARC)
                /*
                 * We must use FALSE to ensure arc_flush returns, since
                 * we're not preventing concurrent ARC insertions.
                 */
                arc_flush(NULL, FALSE);

        return (0);
}

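/*
 * A rough sketch of how userland reaches zio_inject_fault(), assuming
 * the usual zfs_cmd_t ioctl plumbing (zinject(8) does the equivalent
 * internally):
 *
 *	zfs_cmd_t zc = { 0 };
 *	(void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
 *	zc.zc_inject_record = zr;	// a populated zinject_record_t
 *	zc.zc_guid = flags;
 *	if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) == 0)
 *		id = (int)zc.zc_guid;	// new handler id returned here
 */
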
/*
 * Returns the next record with an ID greater than that supplied to the
 * function. Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
        inject_handler_t *handler;
        int ret;

        mutex_enter(&spa_namespace_lock);
        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler))
                if (handler->zi_id > *id)
                        break;

        if (handler) {
                *record = handler->zi_record;
                *id = handler->zi_id;
                ASSERT(handler->zi_spa || handler->zi_spa_name);
                if (handler->zi_spa != NULL)
                        (void) strlcpy(name, spa_name(handler->zi_spa), buflen);
                else
                        (void) strlcpy(name, handler->zi_spa_name, buflen);
                ret = 0;
        } else {
                ret = SET_ERROR(ENOENT);
        }

        rw_exit(&inject_lock);
        mutex_exit(&spa_namespace_lock);

        return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_WRITER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler))
                if (handler->zi_id == id)
                        break;

        if (handler == NULL) {
                rw_exit(&inject_lock);
                return (SET_ERROR(ENOENT));
        }

        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                ASSERT3S(inject_delay_count, >, 0);
                inject_delay_count--;
                ASSERT3S(inject_delay_count, >=, 0);
        }

        list_remove(&inject_handlers, handler);
        rw_exit(&inject_lock);

        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                ASSERT3P(handler->zi_lanes, !=, NULL);
                kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
                    handler->zi_record.zi_nlanes);
        } else {
                ASSERT3P(handler->zi_lanes, ==, NULL);
        }

        if (handler->zi_spa_name != NULL)
                spa_strfree(handler->zi_spa_name);

        if (handler->zi_spa != NULL)
                spa_inject_delref(handler->zi_spa);
        kmem_free(handler, sizeof (inject_handler_t));
        atomic_dec_32(&zio_injection_enabled);

        return (0);
}

void
zio_inject_init(void)
{
        rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
        mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
        list_create(&inject_handlers, sizeof (inject_handler_t),
            offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
        list_destroy(&inject_handlers);
        mutex_destroy(&inject_delay_mtx);
        rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif