/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault. These are kept in a global list. Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field. If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;

/*
 * Data describing each zinject handler registered on the system; it also
 * contains the list node linking the handler into the global zinject
 * handler list.
 */
typedef struct inject_handler {
	int zi_id;
	spa_t *zi_spa;
	zinject_record_t zi_record;
	uint64_t *zi_lanes;
	int zi_next_lane;
	list_node_t zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above; as well as the inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock should be
 * taken as a RW_WRITER; and any time traversal is done over the list
 * (without modification to it) this lock should be taken as a RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must hold the inject_lock as
 * RW_WRITER, and reads of this count must hold it as (at least)
 * RW_READER.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(), refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
	/*
	 * zero implies always (100%)
	 */
	if (frequency == 0)
		return (B_TRUE);

	/*
	 * Note: we still handle legacy (unscaled) frequency values
	 */
	uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

	return (spa_get_random(maximum) < frequency);
}

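/*
 * Worked example (editorial, illustrative only): with the scaled
 * encoding, a 25% trigger rate arrives as
 * frequency = ZI_PERCENTAGE_MAX / 4, and spa_get_random(ZI_PERCENTAGE_MAX)
 * falls below it roughly one call in four. A legacy caller passing
 * frequency = 25 (<= 100) is instead compared against
 * spa_get_random(100), giving the same 25% rate at coarser granularity.
 */
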
/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
	/*
	 * Check for a match against the MOS, which is based on type
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			return (freq_triggered(record->zi_freq));
		else
			return (B_FALSE);
	}

	/*
	 * Check for an exact match.
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
	    error == record->zi_error) {
		return (freq_triggered(record->zi_freq));
	}

	return (B_FALSE);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa)
			continue;

		if (handler->zi_record.zi_type == type &&
		    strcmp(tag, handler->zi_record.zi_func) == 0)
			panic("Panic requested in function %s\n", tag);
	}

	rw_exit(&inject_lock);
}

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
			continue;

		if (zio_match_handler(zb, type, ZI_NO_DVA,
		    &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);
	return (ret);
}

/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
	int i = ZI_NO_DVA;

	if (zio->io_bp != NULL && zio->io_vd != NULL &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
			dva_t *dva = &zio->io_bp->blk_dva[i];
			uint64_t off = DVA_GET_OFFSET(dva);
			vdev_t *vd = vdev_lookup_top(zio->io_spa,
			    DVA_GET_VDEV(dva));

			/* Compensate for vdev label added to leaves */
			if (zio->io_vd->vdev_ops->vdev_op_leaf)
				off += VDEV_LABEL_START_SIZE;

			if (zio->io_vd == vd && zio->io_offset == off)
				break;
		}
	}

	return (i);
}

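/*
 * Editorial note: the index returned above feeds the zi_dvas bitmask
 * test in zio_match_handler(). For example, a record created to target
 * only the second copy of a block would carry zi_dvas = (1ULL << 1),
 * so a physical child I/O for DVA 0 or DVA 2 would not match; a zi_dvas
 * of 0 matches any DVA.
 */
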
/*
 * Determine if the I/O in question should return failure. Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    zio_match_dva(zio), &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label. We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}

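/*
 * For orientation (editorial sketch of the layout this depends on):
 * every vdev keeps four label copies, two at the front of the device
 * (within VDEV_LABEL_START_SIZE) and two at the end (within
 * VDEV_LABEL_END_SIZE), which is why the early return above filters
 * out I/O to the middle of the device. vdev_label_number() identifies
 * which copy an absolute offset falls in, and vdev_label_offset()
 * rebases the handler's relative [zi_start, zi_end] window onto that
 * copy before the range check.
 */
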
/*ARGSUSED*/
static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
	ASSERTV(zio_t *zio = private);
	uint8_t *buffer = data;
	uint_t byte = spa_get_random(len);

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/* flip a single random bit in an abd data buffer */
	buffer[byte] ^= 1 << spa_get_random(8);

	return (1);	/* stop after first flip */
}

static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during
	 * device open (i.e. zio == NULL).
	 */
	if (zio != NULL) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL &&
			    handler->zi_record.zi_iotype != ZIO_TYPES &&
			    handler->zi_record.zi_iotype != zio->io_type)
				continue;

			if (handler->zi_record.zi_error == err1 ||
			    handler->zi_record.zi_error == err2) {
				/*
				 * limit error injection if requested
				 */
				if (!freq_triggered(handler->zi_record.zi_freq))
					continue;

				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (err1 == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				/*
				 * EILSEQ means flip a bit after a read
				 */
				if (handler->zi_record.zi_error == EILSEQ) {
					if (zio == NULL)
						break;

					/* locate buffer data and flip a bit */
					(void) abd_iterate_func(zio->io_abd, 0,
					    zio->io_size, zio_inject_bitflip_cb,
					    zio);
					break;
				}

				ret = handler->zi_record.zi_error;
				break;
			}
			if (handler->zi_record.zi_error == ENXIO) {
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

/*
 * Simulate hardware that ignores cache flushes. For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (spa_get_random(100) < 60)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		break;
	}

	rw_exit(&inject_lock);
}

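/*
 * Editorial note on zi_duration semantics (as used above and checked in
 * spa_handle_ignored_writes() below): a positive zi_duration is a span
 * in seconds, so zi_timer records the lbolt timestamp when the first
 * write was ignored; a negative zi_duration is a span in txgs, so
 * zi_timer records the starting txg instead. For example, a duration of
 * -2 first triggered at txg 100 is expected to expire by the time the
 * pool is syncing txg 102.
 */
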
void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		if (handler->zi_record.zi_duration > 0) {
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    ddi_time_after64(
			    (int64_t)handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz,
			    ddi_get_lbolt64()));
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers. These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here. A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed. This short circuit keeps us from acquiring the
	 * inject_delay_mtx unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}

	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it. Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field. Thus if a handler is configured with
	 * a single lane with a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms. So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler. The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2). This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		if (!freq_triggered(handler->zi_record.zi_freq))
			continue;

		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/*
		 * Defensive; should never happen as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen, the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes. We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler; as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest). This saves a
		 * scan of each handler's lanes array.
		 *
		 * There are two cases to consider when determining when
		 * this specific IO request should complete. If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete after zi_timer milliseconds. Thus,
		 * we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * zi_timer milliseconds after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations. Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * field below.
		 */

		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev, otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;
	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}

static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	objset_t *os = NULL;
	dnode_t *dn = NULL;
	int error;

	/*
	 * Obtain the dnode for object using pool, objset, and object
	 */
	error = dsl_pool_hold(pool, FTAG, &dp);
	if (error)
		return (error);

	error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
	dsl_pool_rele(dp, FTAG);
	if (error)
		return (error);

	error = dmu_objset_from_ds(ds, &os);
	dsl_dataset_rele(ds, FTAG);
	if (error)
		return (error);

	error = dnode_hold(os, record->zi_object, FTAG, &dn);
	if (error)
		return (error);

	/*
	 * Translate the range into block IDs
	 */
	if (record->zi_start != 0 || record->zi_end != -1ULL) {
		record->zi_start >>= dn->dn_datablkshift;
		record->zi_end >>= dn->dn_datablkshift;
	}
	if (record->zi_level > 0) {
		if (record->zi_level >= dn->dn_nlevels) {
			dnode_rele(dn, FTAG);
			return (SET_ERROR(EDOM));
		}

		if (record->zi_start != 0 || record->zi_end != 0) {
			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			for (int level = record->zi_level; level > 0; level--) {
				record->zi_start >>= shift;
				record->zi_end >>= shift;
			}
		}
	}

	dnode_rele(dn, FTAG);
	return (0);
}

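/*
 * Worked example (editorial, illustrative numbers): for a dataset with
 * a 128K recordsize, dn_datablkshift is 17, so a byte range of
 * [0, 1M) maps to level-0 blkids 0 through 7. If the record targets
 * level 1 and the dnode uses 128K indirect blocks (dn_indblkshift = 17,
 * SPA_BLKPTRSHIFT = 7), each blkid is shifted right a further 10 bits,
 * since one level-1 block points at 1024 level-0 blocks.
 */
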
/*
 * Create a new handler for the given record. We add it to the list, adding
 * a reference to the spa_t in the process. We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler. Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large"
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * If the supplied range was in bytes -- calculate the actual blkid
	 */
	if (flags & ZINJECT_CALC_RANGE) {
		error = zio_calculate_range(name, record);
		if (error != 0)
			return (error);
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * spa_inject_addref() will add an injection reference, which
		 * will prevent the pool from being removed from the namespace
		 * while still allowing it to be unloaded.
		 */
		if ((spa = spa_inject_addref(name)) == NULL)
			return (SET_ERROR(ENOENT));

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);

		handler->zi_spa = spa;
		handler->zi_record = *record;

		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer. Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}

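/*
 * Usage sketch (editorial note; see zinject(8) for the authoritative
 * syntax): this function is reached via the ZFS_IOC_INJECT_FAULT ioctl
 * issued by the zinject command. For example, a command along the lines
 * of "zinject -d vd0 -e io -f 25 tank" would register a
 * ZINJECT_DEVICE_FAULT handler for vdev vd0 with zi_freq requesting a
 * 25% trigger rate, while "zinject -D 25:2 -d vd0 tank" would register
 * a ZINJECT_DELAY_IO handler with a 25ms delay and two lanes. Handlers
 * are cleared through zio_clear_fault() below ("zinject -c all").
 */
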
/*
 * Returns the next record with an ID greater than that supplied to the
 * function. Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
	inject_handler_t *handler;
	int ret;

	mutex_enter(&spa_namespace_lock);
	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id > *id)
			break;

	if (handler) {
		*record = handler->zi_record;
		*id = handler->zi_id;
		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
		ret = 0;
	} else {
		ret = SET_ERROR(ENOENT);
	}

	rw_exit(&inject_lock);
	mutex_exit(&spa_namespace_lock);

	return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_WRITER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id == id)
			break;

	if (handler == NULL) {
		rw_exit(&inject_lock);
		return (SET_ERROR(ENOENT));
	}

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3S(inject_delay_count, >, 0);
		inject_delay_count--;
		ASSERT3S(inject_delay_count, >=, 0);
	}

	list_remove(&inject_handlers, handler);
	rw_exit(&inject_lock);

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3P(handler->zi_lanes, !=, NULL);
		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
		    handler->zi_record.zi_nlanes);
	} else {
		ASSERT3P(handler->zi_lanes, ==, NULL);
	}

	spa_inject_delref(handler->zi_spa);
	kmem_free(handler, sizeof (inject_handler_t));
	atomic_dec_32(&zio_injection_enabled);

	return (0);
}

void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif