]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
34dc7c2f BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
9babb374 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
5ffb9d1d | 26 | /* |
03e02e5b | 27 | * Copyright (c) 2012,2021 by Delphix. All rights reserved. |
5ffb9d1d GW |
28 | */ |
29 | ||
34dc7c2f BB |
30 | #include <sys/spa.h> |
31 | #include <sys/spa_impl.h> | |
32 | #include <sys/vdev.h> | |
33 | #include <sys/vdev_impl.h> | |
34 | #include <sys/zio.h> | |
428870ff | 35 | #include <sys/zio_checksum.h> |
34dc7c2f BB |
36 | |
37 | #include <sys/fm/fs/zfs.h> | |
38 | #include <sys/fm/protocol.h> | |
39 | #include <sys/fm/util.h> | |
40 | #include <sys/sysevent.h> | |
41 | ||
42 | /* | |
43 | * This general routine is responsible for generating all the different ZFS | |
44 | * ereports. The payload is dependent on the class, and which arguments are | |
45 | * supplied to the function: | |
46 | * | |
47 | * EREPORT POOL VDEV IO | |
48 | * block X X X | |
49 | * data X - X | |
50 | * device X X - | |
51 | * pool X - - | |
52 | * | |
53 | * If we are in a loading state, all errors are chained together by the same | |
b128c09f | 54 | * SPA-wide ENA (Error Numeric Association). |
34dc7c2f BB |
55 | * |
56 | * For isolated I/O requests, we get the ENA from the zio_t. The propagation | |
57 | * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want | |
58 | * to chain together all ereports associated with a logical piece of data. For | |
59 | * read I/Os, there are basically three 'types' of I/O, which form a roughly | |
60 | * layered diagram: | |
61 | * | |
14e4e3cb | 62 | * +---------------+ |
34dc7c2f BB |
63 | * | Aggregate I/O | No associated logical data or device |
64 | * +---------------+ | |
65 | * | | |
66 | * V | |
67 | * +---------------+ Reads associated with a piece of logical data. | |
68 | * | Read I/O | This includes reads on behalf of RAID-Z, | |
69 | * +---------------+ mirrors, gang blocks, retries, etc. | |
70 | * | | |
71 | * V | |
72 | * +---------------+ Reads associated with a particular device, but | |
73 | * | Physical I/O | no logical data. Issued as part of vdev caching | |
74 | * +---------------+ and I/O aggregation. | |
75 | * | |
76 | * Note that 'physical I/O' here is not the same terminology as used in the rest | |
77 | * of ZIO. Typically, 'physical I/O' simply means that there is no attached | |
78 | * blockpointer. But I/O with no associated block pointer can still be related | |
79 | * to a logical piece of data (i.e. RAID-Z requests). | |
80 | * | |
81 | * Purely physical I/O always have unique ENAs. They are not related to a | |
82 | * particular piece of logical data, and therefore cannot be chained together. | |
83 | * We still generate an ereport, but the DE doesn't correlate it with any | |
84 | * logical piece of data. When such an I/O fails, the delegated I/O requests | |
85 | * will issue a retry, which will trigger the 'real' ereport with the correct | |
86 | * ENA. | |
87 | * | |
88 | * We keep track of the ENA for a ZIO chain through the 'io_logical' member. | |
89 | * When a new logical I/O is issued, we set this to point to itself. Child I/Os | |
90 | * then inherit this pointer, so that when it is first set subsequent failures | |
b128c09f BB |
91 | * will use the same ENA. For vdev cache fill and queue aggregation I/O, |
92 | * this pointer is set to NULL, and no ereport will be generated (since it | |
93 | * doesn't actually correspond to any particular device or piece of data, | |
94 | * and the caller will always retry without caching or queueing anyway). | |
428870ff BB |
95 | * |
96 | * For checksum errors, we want to include more information about the actual | |
97 | * error which occurs. Accordingly, we build an ereport when the error is | |
98 | * noticed, but instead of sending it in immediately, we hang it off of the | |
99 | * io_cksum_report field of the logical IO. When the logical IO completes | |
100 | * (successfully or not), zfs_ereport_finish_checksum() is called with the | |
101 | * good and bad versions of the buffer (if available), and we annotate the | |
102 | * ereport with information about the differences. | |
34dc7c2f | 103 | */ |
4f072827 | 104 | |
428870ff | 105 | #ifdef _KERNEL |
4f072827 DB |
106 | /* |
107 | * Duplicate ereport Detection | |
108 | * | |
109 | * Some ereports are retained momentarily for detecting duplicates. These | |
110 | * are kept in a recent_events_node_t in both a time-ordered list and an AVL | |
111 | * tree of recent unique ereports. | |
112 | * | |
113 | * The lifespan of these recent ereports is bounded (15 mins) and a cleaner | |
114 | * task is used to purge stale entries. | |
115 | */ | |
116 | static list_t recent_events_list; | |
117 | static avl_tree_t recent_events_tree; | |
118 | static kmutex_t recent_events_lock; | |
119 | static taskqid_t recent_events_cleaner_tqid; | |
120 | ||
121 | /* | |
122 | * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. | |
123 | * | |
124 | * This setting can be changed dynamically and setting it to zero | |
125 | * disables duplicate detection. | |
126 | */ | |
18168da7 | 127 | static unsigned int zfs_zevent_retain_max = 2000; |
4f072827 DB |
128 | |
129 | /* | |
130 | * The lifespan for a recent ereport entry. The default of 15 minutes is | |
131 | * intended to outlive the zfs diagnosis engine's threshold of 10 errors | |
132 | * over a period of 10 minutes. | |
133 | */ | |
18168da7 | 134 | static unsigned int zfs_zevent_retain_expire_secs = 900; |
4f072827 DB |
135 | |
/* Ereport subclasses that are tracked for duplicate detection. */
typedef enum zfs_subclass {
	ZSC_IO = 0,
	ZSC_DATA = 1,
	ZSC_CHECKSUM = 2
} zfs_subclass_t;
141 | ||
142 | typedef struct { | |
143 | /* common criteria */ | |
144 | uint64_t re_pool_guid; | |
145 | uint64_t re_vdev_guid; | |
146 | int re_io_error; | |
147 | uint64_t re_io_size; | |
148 | uint64_t re_io_offset; | |
149 | zfs_subclass_t re_subclass; | |
150 | zio_priority_t re_io_priority; | |
151 | ||
152 | /* logical zio criteria (optional) */ | |
153 | zbookmark_phys_t re_io_bookmark; | |
154 | ||
155 | /* internal state */ | |
156 | avl_node_t re_tree_link; | |
157 | list_node_t re_list_link; | |
158 | uint64_t re_timestamp; | |
159 | } recent_events_node_t; | |
160 | ||
161 | static int | |
162 | recent_events_compare(const void *a, const void *b) | |
163 | { | |
164 | const recent_events_node_t *node1 = a; | |
165 | const recent_events_node_t *node2 = b; | |
166 | int cmp; | |
167 | ||
168 | /* | |
169 | * The comparison order here is somewhat arbitrary. | |
170 | * What's important is that if every criteria matches, then it | |
171 | * is a duplicate (i.e. compare returns 0) | |
172 | */ | |
173 | if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) | |
174 | return (cmp); | |
175 | if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) | |
176 | return (cmp); | |
177 | if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) | |
178 | return (cmp); | |
179 | if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) | |
180 | return (cmp); | |
181 | if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) | |
182 | return (cmp); | |
183 | if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) | |
184 | return (cmp); | |
185 | if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) | |
186 | return (cmp); | |
187 | ||
188 | const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; | |
189 | const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; | |
190 | ||
191 | if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) | |
192 | return (cmp); | |
193 | if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) | |
194 | return (cmp); | |
195 | if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) | |
196 | return (cmp); | |
197 | if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) | |
198 | return (cmp); | |
199 | ||
200 | return (0); | |
201 | } | |
202 | ||
69f024a5 RW |
203 | /* |
204 | * workaround: vdev properties don't have inheritance | |
205 | */ | |
206 | static uint64_t | |
207 | vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) | |
208 | { | |
209 | uint64_t propdef, propval; | |
210 | ||
211 | propdef = vdev_prop_default_numeric(prop); | |
212 | switch (prop) { | |
213 | case VDEV_PROP_CHECKSUM_N: | |
214 | propval = vd->vdev_checksum_n; | |
215 | break; | |
216 | case VDEV_PROP_CHECKSUM_T: | |
217 | propval = vd->vdev_checksum_t; | |
218 | break; | |
219 | case VDEV_PROP_IO_N: | |
220 | propval = vd->vdev_io_n; | |
221 | break; | |
222 | case VDEV_PROP_IO_T: | |
223 | propval = vd->vdev_io_t; | |
224 | break; | |
225 | default: | |
226 | propval = propdef; | |
227 | break; | |
228 | } | |
229 | ||
230 | if (propval != propdef) | |
231 | return (propval); | |
232 | ||
233 | if (vd->vdev_parent == NULL) | |
234 | return (propdef); | |
235 | ||
236 | return (vdev_prop_get_inherited(vd->vdev_parent, prop)); | |
237 | } | |
238 | ||
4f072827 DB |
239 | static void zfs_ereport_schedule_cleaner(void); |
240 | ||
241 | /* | |
242 | * background task to clean stale recent event nodes. | |
243 | */ | |
4f072827 DB |
244 | static void |
245 | zfs_ereport_cleaner(void *arg) | |
246 | { | |
247 | recent_events_node_t *entry; | |
248 | uint64_t now = gethrtime(); | |
249 | ||
250 | /* | |
251 | * purge expired entries | |
252 | */ | |
253 | mutex_enter(&recent_events_lock); | |
254 | while ((entry = list_tail(&recent_events_list)) != NULL) { | |
255 | uint64_t age = NSEC2SEC(now - entry->re_timestamp); | |
256 | if (age <= zfs_zevent_retain_expire_secs) | |
257 | break; | |
258 | ||
259 | /* remove expired node */ | |
260 | avl_remove(&recent_events_tree, entry); | |
261 | list_remove(&recent_events_list, entry); | |
262 | kmem_free(entry, sizeof (*entry)); | |
263 | } | |
264 | ||
265 | /* Restart the cleaner if more entries remain */ | |
266 | recent_events_cleaner_tqid = 0; | |
267 | if (!list_is_empty(&recent_events_list)) | |
268 | zfs_ereport_schedule_cleaner(); | |
269 | ||
270 | mutex_exit(&recent_events_lock); | |
271 | } | |
272 | ||
273 | static void | |
274 | zfs_ereport_schedule_cleaner(void) | |
275 | { | |
276 | ASSERT(MUTEX_HELD(&recent_events_lock)); | |
277 | ||
278 | uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); | |
279 | ||
280 | recent_events_cleaner_tqid = taskq_dispatch_delay( | |
281 | system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, | |
282 | ddi_get_lbolt() + NSEC_TO_TICK(timeout)); | |
283 | } | |
284 | ||
03e02e5b DB |
285 | /* |
286 | * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL | |
287 | */ | |
288 | void | |
289 | zfs_ereport_clear(spa_t *spa, vdev_t *vd) | |
290 | { | |
291 | uint64_t vdev_guid, pool_guid; | |
03e02e5b DB |
292 | |
293 | ASSERT(vd != NULL || spa != NULL); | |
294 | if (vd == NULL) { | |
295 | vdev_guid = 0; | |
296 | pool_guid = spa_guid(spa); | |
297 | } else { | |
298 | vdev_guid = vd->vdev_guid; | |
299 | pool_guid = 0; | |
300 | } | |
301 | ||
302 | mutex_enter(&recent_events_lock); | |
303 | ||
304 | recent_events_node_t *next = list_head(&recent_events_list); | |
305 | while (next != NULL) { | |
306 | recent_events_node_t *entry = next; | |
307 | ||
308 | next = list_next(&recent_events_list, next); | |
309 | ||
310 | if (entry->re_vdev_guid == vdev_guid || | |
311 | entry->re_pool_guid == pool_guid) { | |
312 | avl_remove(&recent_events_tree, entry); | |
313 | list_remove(&recent_events_list, entry); | |
314 | kmem_free(entry, sizeof (*entry)); | |
03e02e5b DB |
315 | } |
316 | } | |
317 | ||
318 | mutex_exit(&recent_events_lock); | |
319 | } | |
320 | ||
4f072827 DB |
321 | /* |
322 | * Check if an ereport would be a duplicate of one recently posted. | |
323 | * | |
324 | * An ereport is considered a duplicate if the set of criteria in | |
325 | * recent_events_node_t all match. | |
326 | * | |
327 | * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM | |
328 | * are candidates for duplicate checking. | |
329 | */ | |
330 | static boolean_t | |
331 | zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, | |
332 | const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) | |
333 | { | |
334 | recent_events_node_t search = {0}, *entry; | |
335 | ||
336 | if (vd == NULL || zio == NULL) | |
337 | return (B_FALSE); | |
338 | ||
339 | if (zfs_zevent_retain_max == 0) | |
340 | return (B_FALSE); | |
341 | ||
342 | if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) | |
343 | search.re_subclass = ZSC_IO; | |
344 | else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) | |
345 | search.re_subclass = ZSC_DATA; | |
346 | else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) | |
347 | search.re_subclass = ZSC_CHECKSUM; | |
348 | else | |
349 | return (B_FALSE); | |
350 | ||
351 | search.re_pool_guid = spa_guid(spa); | |
352 | search.re_vdev_guid = vd->vdev_guid; | |
353 | search.re_io_error = zio->io_error; | |
354 | search.re_io_priority = zio->io_priority; | |
355 | /* if size is supplied use it over what's in zio */ | |
356 | if (size) { | |
357 | search.re_io_size = size; | |
358 | search.re_io_offset = offset; | |
359 | } else { | |
360 | search.re_io_size = zio->io_size; | |
361 | search.re_io_offset = zio->io_offset; | |
362 | } | |
363 | ||
364 | /* grab optional logical zio criteria */ | |
365 | if (zb != NULL) { | |
366 | search.re_io_bookmark.zb_objset = zb->zb_objset; | |
367 | search.re_io_bookmark.zb_object = zb->zb_object; | |
368 | search.re_io_bookmark.zb_level = zb->zb_level; | |
369 | search.re_io_bookmark.zb_blkid = zb->zb_blkid; | |
370 | } | |
371 | ||
372 | uint64_t now = gethrtime(); | |
373 | ||
374 | mutex_enter(&recent_events_lock); | |
375 | ||
376 | /* check if we have seen this one recently */ | |
377 | entry = avl_find(&recent_events_tree, &search, NULL); | |
378 | if (entry != NULL) { | |
379 | uint64_t age = NSEC2SEC(now - entry->re_timestamp); | |
380 | ||
381 | /* | |
382 | * There is still an active cleaner (since we're here). | |
383 | * Reset the last seen time for this duplicate entry | |
384 | * so that its lifespand gets extended. | |
385 | */ | |
386 | list_remove(&recent_events_list, entry); | |
387 | list_insert_head(&recent_events_list, entry); | |
388 | entry->re_timestamp = now; | |
389 | ||
390 | zfs_zevent_track_duplicate(); | |
391 | mutex_exit(&recent_events_lock); | |
392 | ||
393 | return (age <= zfs_zevent_retain_expire_secs); | |
394 | } | |
395 | ||
396 | if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { | |
397 | /* recycle oldest node */ | |
398 | entry = list_tail(&recent_events_list); | |
399 | ASSERT(entry != NULL); | |
400 | list_remove(&recent_events_list, entry); | |
401 | avl_remove(&recent_events_tree, entry); | |
402 | } else { | |
403 | entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); | |
404 | } | |
405 | ||
406 | /* record this as a recent ereport */ | |
407 | *entry = search; | |
408 | avl_add(&recent_events_tree, entry); | |
409 | list_insert_head(&recent_events_list, entry); | |
410 | entry->re_timestamp = now; | |
411 | ||
412 | /* Start a cleaner if not already scheduled */ | |
413 | if (recent_events_cleaner_tqid == 0) | |
414 | zfs_ereport_schedule_cleaner(); | |
415 | ||
416 | mutex_exit(&recent_events_lock); | |
417 | return (B_FALSE); | |
418 | } | |
419 | ||
12fa0466 | 420 | void |
26685276 BB |
421 | zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) |
422 | { | |
423 | if (nvl) | |
424 | fm_nvlist_destroy(nvl, FM_NVA_FREE); | |
425 | ||
426 | if (detector) | |
427 | fm_nvlist_destroy(detector, FM_NVA_FREE); | |
428 | } | |
429 | ||
6078881a | 430 | /* |
e778b048 RM |
431 | * We want to rate limit ZIO delay, deadman, and checksum events so as to not |
432 | * flood zevent consumers when a disk is acting up. | |
6078881a TH |
433 | * |
434 | * Returns 1 if we're ratelimiting, 0 if not. | |
435 | */ | |
436 | static int | |
437 | zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) | |
438 | { | |
439 | int rc = 0; | |
440 | /* | |
e778b048 | 441 | * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we |
6078881a TH |
442 | * are. Invert it to get our return value. |
443 | */ | |
444 | if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { | |
445 | rc = !zfs_ratelimit(&vd->vdev_delay_rl); | |
e778b048 RM |
446 | } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) { |
447 | rc = !zfs_ratelimit(&vd->vdev_deadman_rl); | |
6078881a TH |
448 | } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { |
449 | rc = !zfs_ratelimit(&vd->vdev_checksum_rl); | |
450 | } | |
451 | ||
452 | if (rc) { | |
453 | /* We're rate limiting */ | |
454 | fm_erpt_dropped_increment(); | |
455 | } | |
456 | ||
457 | return (rc); | |
458 | } | |
0426c168 | 459 | |
ad796b8a TH |
460 | /* |
461 | * Return B_TRUE if the event actually posted, B_FALSE if not. | |
462 | */ | |
463 | static boolean_t | |
428870ff | 464 | zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, |
a2c2ed1b | 465 | const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, |
b5256303 | 466 | zio_t *zio, uint64_t stateoroffset, uint64_t size) |
34dc7c2f | 467 | { |
34dc7c2f | 468 | nvlist_t *ereport, *detector; |
428870ff | 469 | |
34dc7c2f BB |
470 | uint64_t ena; |
471 | char class[64]; | |
472 | ||
34dc7c2f | 473 | if ((ereport = fm_nvlist_create(NULL)) == NULL) |
ad796b8a | 474 | return (B_FALSE); |
34dc7c2f BB |
475 | |
476 | if ((detector = fm_nvlist_create(NULL)) == NULL) { | |
477 | fm_nvlist_destroy(ereport, FM_NVA_FREE); | |
ad796b8a | 478 | return (B_FALSE); |
34dc7c2f BB |
479 | } |
480 | ||
481 | /* | |
482 | * Serialize ereport generation | |
483 | */ | |
484 | mutex_enter(&spa->spa_errlist_lock); | |
485 | ||
486 | /* | |
487 | * Determine the ENA to use for this event. If we are in a loading | |
488 | * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use | |
489 | * a root zio-wide ENA. Otherwise, simply use a unique ENA. | |
490 | */ | |
428870ff | 491 | if (spa_load_state(spa) != SPA_LOAD_NONE) { |
34dc7c2f BB |
492 | if (spa->spa_ena == 0) |
493 | spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); | |
494 | ena = spa->spa_ena; | |
495 | } else if (zio != NULL && zio->io_logical != NULL) { | |
496 | if (zio->io_logical->io_ena == 0) | |
497 | zio->io_logical->io_ena = | |
498 | fm_ena_generate(0, FM_ENA_FMT1); | |
499 | ena = zio->io_logical->io_ena; | |
500 | } else { | |
501 | ena = fm_ena_generate(0, FM_ENA_FMT1); | |
502 | } | |
503 | ||
504 | /* | |
505 | * Construct the full class, detector, and other standard FMA fields. | |
506 | */ | |
507 | (void) snprintf(class, sizeof (class), "%s.%s", | |
508 | ZFS_ERROR_CLASS, subclass); | |
509 | ||
510 | fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), | |
511 | vd != NULL ? vd->vdev_guid : 0); | |
512 | ||
513 | fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); | |
514 | ||
515 | /* | |
516 | * Construct the per-ereport payload, depending on which parameters are | |
517 | * passed in. | |
518 | */ | |
519 | ||
520 | /* | |
521 | * Generic payload members common to all ereports. | |
34dc7c2f | 522 | */ |
bcdb96a3 C |
523 | fm_payload_set(ereport, |
524 | FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), | |
525 | FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), | |
177c91d0 DB |
526 | FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, |
527 | (uint64_t)spa_state(spa), | |
34dc7c2f | 528 | FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, |
177c91d0 | 529 | (int32_t)spa_load_state(spa), NULL); |
b128c09f | 530 | |
a36cc8d2 | 531 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, |
532 | DATA_TYPE_STRING, | |
533 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? | |
534 | FM_EREPORT_FAILMODE_WAIT : | |
535 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? | |
536 | FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, | |
537 | NULL); | |
34dc7c2f BB |
538 | |
539 | if (vd != NULL) { | |
540 | vdev_t *pvd = vd->vdev_parent; | |
cc92e9d0 | 541 | vdev_queue_t *vq = &vd->vdev_queue; |
904ea276 BB |
542 | vdev_stat_t *vs = &vd->vdev_stat; |
543 | vdev_t *spare_vd; | |
544 | uint64_t *spare_guids; | |
545 | char **spare_paths; | |
546 | int i, spare_count; | |
34dc7c2f BB |
547 | |
548 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, | |
549 | DATA_TYPE_UINT64, vd->vdev_guid, | |
550 | FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, | |
551 | DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); | |
9babb374 | 552 | if (vd->vdev_path != NULL) |
34dc7c2f BB |
553 | fm_payload_set(ereport, |
554 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, | |
555 | DATA_TYPE_STRING, vd->vdev_path, NULL); | |
9babb374 | 556 | if (vd->vdev_devid != NULL) |
34dc7c2f BB |
557 | fm_payload_set(ereport, |
558 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, | |
559 | DATA_TYPE_STRING, vd->vdev_devid, NULL); | |
9babb374 BB |
560 | if (vd->vdev_fru != NULL) |
561 | fm_payload_set(ereport, | |
562 | FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, | |
563 | DATA_TYPE_STRING, vd->vdev_fru, NULL); | |
6568379e TH |
564 | if (vd->vdev_enc_sysfs_path != NULL) |
565 | fm_payload_set(ereport, | |
566 | FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, | |
567 | DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); | |
32a9872b GW |
568 | if (vd->vdev_ashift) |
569 | fm_payload_set(ereport, | |
570 | FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, | |
571 | DATA_TYPE_UINT64, vd->vdev_ashift, NULL); | |
34dc7c2f | 572 | |
cc92e9d0 GW |
573 | if (vq != NULL) { |
574 | fm_payload_set(ereport, | |
575 | FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, | |
576 | DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); | |
577 | fm_payload_set(ereport, | |
578 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, | |
579 | DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); | |
580 | } | |
581 | ||
904ea276 BB |
582 | if (vs != NULL) { |
583 | fm_payload_set(ereport, | |
584 | FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, | |
585 | DATA_TYPE_UINT64, vs->vs_read_errors, | |
586 | FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, | |
587 | DATA_TYPE_UINT64, vs->vs_write_errors, | |
588 | FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, | |
ad796b8a TH |
589 | DATA_TYPE_UINT64, vs->vs_checksum_errors, |
590 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, | |
591 | DATA_TYPE_UINT64, vs->vs_slow_ios, | |
592 | NULL); | |
904ea276 BB |
593 | } |
594 | ||
34dc7c2f BB |
595 | if (pvd != NULL) { |
596 | fm_payload_set(ereport, | |
597 | FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, | |
598 | DATA_TYPE_UINT64, pvd->vdev_guid, | |
599 | FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, | |
600 | DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, | |
601 | NULL); | |
602 | if (pvd->vdev_path) | |
603 | fm_payload_set(ereport, | |
604 | FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, | |
605 | DATA_TYPE_STRING, pvd->vdev_path, NULL); | |
606 | if (pvd->vdev_devid) | |
607 | fm_payload_set(ereport, | |
608 | FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, | |
609 | DATA_TYPE_STRING, pvd->vdev_devid, NULL); | |
610 | } | |
904ea276 BB |
611 | |
612 | spare_count = spa->spa_spares.sav_count; | |
613 | spare_paths = kmem_zalloc(sizeof (char *) * spare_count, | |
79c76d5b | 614 | KM_SLEEP); |
904ea276 | 615 | spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, |
79c76d5b | 616 | KM_SLEEP); |
904ea276 BB |
617 | |
618 | for (i = 0; i < spare_count; i++) { | |
619 | spare_vd = spa->spa_spares.sav_vdevs[i]; | |
620 | if (spare_vd) { | |
621 | spare_paths[i] = spare_vd->vdev_path; | |
622 | spare_guids[i] = spare_vd->vdev_guid; | |
623 | } | |
624 | } | |
625 | ||
626 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, | |
627 | DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, | |
628 | FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, | |
629 | DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); | |
630 | ||
631 | kmem_free(spare_guids, sizeof (uint64_t) * spare_count); | |
632 | kmem_free(spare_paths, sizeof (char *) * spare_count); | |
34dc7c2f BB |
633 | } |
634 | ||
635 | if (zio != NULL) { | |
636 | /* | |
637 | * Payload common to all I/Os. | |
638 | */ | |
639 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, | |
640 | DATA_TYPE_INT32, zio->io_error, NULL); | |
312c07ed BB |
641 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, |
642 | DATA_TYPE_INT32, zio->io_flags, NULL); | |
9dcb9719 BB |
643 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, |
644 | DATA_TYPE_UINT32, zio->io_stage, NULL); | |
645 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, | |
646 | DATA_TYPE_UINT32, zio->io_pipeline, NULL); | |
a69052be BB |
647 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, |
648 | DATA_TYPE_UINT64, zio->io_delay, NULL); | |
cc92e9d0 GW |
649 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, |
650 | DATA_TYPE_UINT64, zio->io_timestamp, NULL); | |
cc92e9d0 GW |
651 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, |
652 | DATA_TYPE_UINT64, zio->io_delta, NULL); | |
4f072827 DB |
653 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, |
654 | DATA_TYPE_UINT32, zio->io_priority, NULL); | |
34dc7c2f BB |
655 | |
656 | /* | |
657 | * If the 'size' parameter is non-zero, it indicates this is a | |
658 | * RAID-Z or other I/O where the physical offset and length are | |
659 | * provided for us, instead of within the zio_t. | |
660 | */ | |
661 | if (vd != NULL) { | |
662 | if (size) | |
663 | fm_payload_set(ereport, | |
664 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
665 | DATA_TYPE_UINT64, stateoroffset, | |
666 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
667 | DATA_TYPE_UINT64, size, NULL); | |
668 | else | |
669 | fm_payload_set(ereport, | |
670 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
671 | DATA_TYPE_UINT64, zio->io_offset, | |
672 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
673 | DATA_TYPE_UINT64, zio->io_size, NULL); | |
674 | } | |
34dc7c2f BB |
675 | } else if (vd != NULL) { |
676 | /* | |
677 | * If we have a vdev but no zio, this is a device fault, and the | |
678 | * 'stateoroffset' parameter indicates the previous state of the | |
679 | * vdev. | |
680 | */ | |
681 | fm_payload_set(ereport, | |
682 | FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, | |
683 | DATA_TYPE_UINT64, stateoroffset, NULL); | |
684 | } | |
428870ff | 685 | |
b5256303 TC |
686 | /* |
687 | * Payload for I/Os with corresponding logical information. | |
688 | */ | |
ad796b8a | 689 | if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { |
b5256303 TC |
690 | fm_payload_set(ereport, |
691 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, | |
692 | DATA_TYPE_UINT64, zb->zb_objset, | |
693 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, | |
694 | DATA_TYPE_UINT64, zb->zb_object, | |
695 | FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, | |
696 | DATA_TYPE_INT64, zb->zb_level, | |
697 | FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, | |
698 | DATA_TYPE_UINT64, zb->zb_blkid, NULL); | |
ad796b8a | 699 | } |
b5256303 | 700 | |
69f024a5 RW |
701 | /* |
702 | * Payload for tuning the zed | |
703 | */ | |
704 | if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { | |
705 | uint64_t cksum_n, cksum_t; | |
706 | ||
707 | cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N); | |
708 | if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N)) | |
709 | fm_payload_set(ereport, | |
710 | FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N, | |
711 | DATA_TYPE_UINT64, | |
712 | cksum_n, | |
713 | NULL); | |
714 | ||
715 | cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T); | |
716 | if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T)) | |
717 | fm_payload_set(ereport, | |
718 | FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T, | |
719 | DATA_TYPE_UINT64, | |
720 | cksum_t, | |
721 | NULL); | |
722 | } | |
723 | ||
724 | if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) { | |
725 | uint64_t io_n, io_t; | |
726 | ||
727 | io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N); | |
728 | if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N)) | |
729 | fm_payload_set(ereport, | |
730 | FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N, | |
731 | DATA_TYPE_UINT64, | |
732 | io_n, | |
733 | NULL); | |
734 | ||
735 | io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T); | |
736 | if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T)) | |
737 | fm_payload_set(ereport, | |
738 | FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T, | |
739 | DATA_TYPE_UINT64, | |
740 | io_t, | |
741 | NULL); | |
742 | } | |
743 | ||
34dc7c2f BB |
744 | mutex_exit(&spa->spa_errlist_lock); |
745 | ||
428870ff BB |
746 | *ereport_out = ereport; |
747 | *detector_out = detector; | |
ad796b8a | 748 | return (B_TRUE); |
428870ff BB |
749 | } |
750 | ||
/*
 * Corruption of at most 128 bytes is saved directly, as inline arrays of
 * 64-bit words.
 */
#define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))

/* Capacity of the per-report range arrays below. */
#define	MAX_RANGES		16

typedef struct zfs_ecksum_info {
	/* inline copies of the bits that were set and that were cleared */
	uint64_t zei_bits_set[ZFM_MAX_INLINE];
	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

	/*
	 * Per-range counts of bits set and bits cleared.  Summing all of
	 * them gives the Hamming distance between the good and the bad
	 * buffer.
	 */
	uint32_t zei_range_sets[MAX_RANGES];
	uint32_t zei_range_clears[MAX_RANGES];

	/* the ranges themselves, as start/end pairs */
	struct zei_ranges {
		uint32_t	zr_start;
		uint32_t	zr_end;
	} zei_ranges[MAX_RANGES];

	/* number of zei_ranges[] slots in use, and gap bookkeeping */
	size_t zei_range_count;
	uint32_t zei_mingap;
	uint32_t zei_allowed_mingap;

} zfs_ecksum_info_t;
778 | ||
/*
 * Add the number of 1 bits in value_arg to *count.
 *
 * value_arg	word whose set bits are to be counted
 * count	running total that is incremented by that many bits
 */
static void
update_bad_bits(uint64_t value_arg, uint32_t *count)
{
	uint32_t bits = 0;

	/*
	 * A population count does not depend on byte order, so the word is
	 * examined as-is.  Each pass clears the lowest bit that is still
	 * set, so the loop runs once per 1 bit.
	 */
	for (uint64_t value = value_arg; value != 0; value &= value - 1)
		bits++;

	/* update the count of bits changed */
	*count += bits;
}
794 | ||
795 | /* | |
796 | * We've now filled up the range array, and need to increase "mingap" and | |
797 | * shrink the range list accordingly. zei_mingap is always the smallest | |
798 | * distance between array entries, so we set the new_allowed_gap to be | |
799 | * one greater than that. We then go through the list, joining together | |
800 | * any ranges which are closer than the new_allowed_gap. | |
801 | * | |
802 | * By construction, there will be at least one. We also update zei_mingap | |
803 | * to the new smallest gap, to prepare for our next invocation. | |
804 | */ | |
805 | static void | |
26685276 | 806 | zei_shrink_ranges(zfs_ecksum_info_t *eip) |
428870ff BB |
807 | { |
808 | uint32_t mingap = UINT32_MAX; | |
809 | uint32_t new_allowed_gap = eip->zei_mingap + 1; | |
810 | ||
811 | size_t idx, output; | |
812 | size_t max = eip->zei_range_count; | |
813 | ||
814 | struct zei_ranges *r = eip->zei_ranges; | |
815 | ||
816 | ASSERT3U(eip->zei_range_count, >, 0); | |
817 | ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); | |
818 | ||
819 | output = idx = 0; | |
820 | while (idx < max - 1) { | |
821 | uint32_t start = r[idx].zr_start; | |
822 | uint32_t end = r[idx].zr_end; | |
823 | ||
824 | while (idx < max - 1) { | |
26685276 | 825 | idx++; |
428870ff | 826 | |
1c27024e DB |
827 | uint32_t nstart = r[idx].zr_start; |
828 | uint32_t nend = r[idx].zr_end; | |
829 | ||
830 | uint32_t gap = nstart - end; | |
428870ff BB |
831 | if (gap < new_allowed_gap) { |
832 | end = nend; | |
833 | continue; | |
834 | } | |
835 | if (gap < mingap) | |
836 | mingap = gap; | |
837 | break; | |
838 | } | |
839 | r[output].zr_start = start; | |
840 | r[output].zr_end = end; | |
841 | output++; | |
842 | } | |
843 | ASSERT3U(output, <, eip->zei_range_count); | |
844 | eip->zei_range_count = output; | |
845 | eip->zei_mingap = mingap; | |
846 | eip->zei_allowed_mingap = new_allowed_gap; | |
847 | } | |
848 | ||
849 | static void | |
26685276 | 850 | zei_add_range(zfs_ecksum_info_t *eip, int start, int end) |
428870ff BB |
851 | { |
852 | struct zei_ranges *r = eip->zei_ranges; | |
853 | size_t count = eip->zei_range_count; | |
854 | ||
855 | if (count >= MAX_RANGES) { | |
26685276 | 856 | zei_shrink_ranges(eip); |
428870ff BB |
857 | count = eip->zei_range_count; |
858 | } | |
859 | if (count == 0) { | |
860 | eip->zei_mingap = UINT32_MAX; | |
861 | eip->zei_allowed_mingap = 1; | |
862 | } else { | |
863 | int gap = start - r[count - 1].zr_end; | |
864 | ||
865 | if (gap < eip->zei_allowed_mingap) { | |
866 | r[count - 1].zr_end = end; | |
867 | return; | |
868 | } | |
869 | if (gap < eip->zei_mingap) | |
870 | eip->zei_mingap = gap; | |
871 | } | |
872 | r[count].zr_start = start; | |
873 | r[count].zr_end = end; | |
874 | eip->zei_range_count++; | |
875 | } | |
876 | ||
877 | static size_t | |
26685276 | 878 | zei_range_total_size(zfs_ecksum_info_t *eip) |
428870ff BB |
879 | { |
880 | struct zei_ranges *r = eip->zei_ranges; | |
881 | size_t count = eip->zei_range_count; | |
882 | size_t result = 0; | |
883 | size_t idx; | |
884 | ||
885 | for (idx = 0; idx < count; idx++) | |
886 | result += (r[idx].zr_end - r[idx].zr_start); | |
887 | ||
888 | return (result); | |
889 | } | |
890 | ||
891 | static zfs_ecksum_info_t * | |
892 | annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, | |
84c07ada | 893 | const abd_t *goodabd, const abd_t *badabd, size_t size, |
428870ff BB |
894 | boolean_t drop_if_identical) |
895 | { | |
84c07ada GN |
896 | const uint64_t *good; |
897 | const uint64_t *bad; | |
428870ff | 898 | |
428870ff BB |
899 | size_t nui64s = size / sizeof (uint64_t); |
900 | ||
901 | size_t inline_size; | |
902 | int no_inline = 0; | |
903 | size_t idx; | |
904 | size_t range; | |
905 | ||
906 | size_t offset = 0; | |
907 | ssize_t start = -1; | |
908 | ||
79c76d5b | 909 | zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); |
428870ff BB |
910 | |
911 | /* don't do any annotation for injected checksum errors */ | |
912 | if (info != NULL && info->zbc_injected) | |
913 | return (eip); | |
914 | ||
915 | if (info != NULL && info->zbc_has_cksum) { | |
916 | fm_payload_set(ereport, | |
428870ff BB |
917 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, |
918 | DATA_TYPE_STRING, | |
919 | info->zbc_checksum_name, | |
920 | NULL); | |
921 | ||
922 | if (info->zbc_byteswapped) { | |
923 | fm_payload_set(ereport, | |
924 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, | |
925 | DATA_TYPE_BOOLEAN, 1, | |
926 | NULL); | |
927 | } | |
928 | } | |
929 | ||
84c07ada | 930 | if (badabd == NULL || goodabd == NULL) |
428870ff BB |
931 | return (eip); |
932 | ||
1b18c6d7 | 933 | ASSERT3U(nui64s, <=, UINT32_MAX); |
428870ff BB |
934 | ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); |
935 | ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); | |
936 | ASSERT3U(size, <=, UINT32_MAX); | |
937 | ||
84c07ada GN |
938 | good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); |
939 | bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); | |
940 | ||
428870ff BB |
941 | /* build up the range list by comparing the two buffers. */ |
942 | for (idx = 0; idx < nui64s; idx++) { | |
943 | if (good[idx] == bad[idx]) { | |
944 | if (start == -1) | |
945 | continue; | |
946 | ||
26685276 | 947 | zei_add_range(eip, start, idx); |
428870ff BB |
948 | start = -1; |
949 | } else { | |
950 | if (start != -1) | |
951 | continue; | |
952 | ||
953 | start = idx; | |
954 | } | |
955 | } | |
956 | if (start != -1) | |
26685276 | 957 | zei_add_range(eip, start, idx); |
428870ff BB |
958 | |
959 | /* See if it will fit in our inline buffers */ | |
26685276 | 960 | inline_size = zei_range_total_size(eip); |
428870ff BB |
961 | if (inline_size > ZFM_MAX_INLINE) |
962 | no_inline = 1; | |
963 | ||
964 | /* | |
965 | * If there is no change and we want to drop if the buffers are | |
966 | * identical, do so. | |
967 | */ | |
968 | if (inline_size == 0 && drop_if_identical) { | |
969 | kmem_free(eip, sizeof (*eip)); | |
84c07ada GN |
970 | abd_return_buf((abd_t *)goodabd, (void *)good, size); |
971 | abd_return_buf((abd_t *)badabd, (void *)bad, size); | |
428870ff BB |
972 | return (NULL); |
973 | } | |
974 | ||
975 | /* | |
976 | * Now walk through the ranges, filling in the details of the | |
977 | * differences. Also convert our uint64_t-array offsets to byte | |
978 | * offsets. | |
979 | */ | |
980 | for (range = 0; range < eip->zei_range_count; range++) { | |
981 | size_t start = eip->zei_ranges[range].zr_start; | |
982 | size_t end = eip->zei_ranges[range].zr_end; | |
983 | ||
984 | for (idx = start; idx < end; idx++) { | |
985 | uint64_t set, cleared; | |
986 | ||
987 | // bits set in bad, but not in good | |
988 | set = ((~good[idx]) & bad[idx]); | |
989 | // bits set in good, but not in bad | |
990 | cleared = (good[idx] & (~bad[idx])); | |
991 | ||
428870ff BB |
992 | if (!no_inline) { |
993 | ASSERT3U(offset, <, inline_size); | |
994 | eip->zei_bits_set[offset] = set; | |
995 | eip->zei_bits_cleared[offset] = cleared; | |
996 | offset++; | |
997 | } | |
998 | ||
cf2a225b AS |
999 | update_bad_bits(set, &eip->zei_range_sets[range]); |
1000 | update_bad_bits(cleared, &eip->zei_range_clears[range]); | |
428870ff BB |
1001 | } |
1002 | ||
1003 | /* convert to byte offsets */ | |
1004 | eip->zei_ranges[range].zr_start *= sizeof (uint64_t); | |
1005 | eip->zei_ranges[range].zr_end *= sizeof (uint64_t); | |
1006 | } | |
84c07ada GN |
1007 | |
1008 | abd_return_buf((abd_t *)goodabd, (void *)good, size); | |
1009 | abd_return_buf((abd_t *)badabd, (void *)bad, size); | |
1010 | ||
428870ff BB |
1011 | eip->zei_allowed_mingap *= sizeof (uint64_t); |
1012 | inline_size *= sizeof (uint64_t); | |
1013 | ||
1014 | /* fill in ereport */ | |
1015 | fm_payload_set(ereport, | |
1016 | FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, | |
1017 | DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, | |
1018 | (uint32_t *)eip->zei_ranges, | |
1019 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, | |
1020 | DATA_TYPE_UINT32, eip->zei_allowed_mingap, | |
1021 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, | |
1022 | DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, | |
1023 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, | |
1024 | DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, | |
1025 | NULL); | |
1026 | ||
1027 | if (!no_inline) { | |
1028 | fm_payload_set(ereport, | |
1029 | FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, | |
1030 | DATA_TYPE_UINT8_ARRAY, | |
1031 | inline_size, (uint8_t *)eip->zei_bits_set, | |
1032 | FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, | |
1033 | DATA_TYPE_UINT8_ARRAY, | |
1034 | inline_size, (uint8_t *)eip->zei_bits_cleared, | |
1035 | NULL); | |
428870ff BB |
1036 | } |
1037 | return (eip); | |
1038 | } | |
03e02e5b | 1039 | #else |
03e02e5b DB |
1040 | void |
1041 | zfs_ereport_clear(spa_t *spa, vdev_t *vd) | |
1042 | { | |
14e4e3cb | 1043 | (void) spa, (void) vd; |
03e02e5b | 1044 | } |
428870ff BB |
1045 | #endif |
1046 | ||
ad796b8a TH |
1047 | /* |
1048 | * Make sure our event is still valid for the given zio/vdev/pool. For example, | |
1049 | * we don't want to keep logging events for a faulted or missing vdev. | |
1050 | */ | |
1051 | boolean_t | |
1052 | zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) | |
1053 | { | |
1054 | #ifdef _KERNEL | |
1055 | /* | |
1056 | * If we are doing a spa_tryimport() or in recovery mode, | |
1057 | * ignore errors. | |
1058 | */ | |
1059 | if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || | |
1060 | spa_load_state(spa) == SPA_LOAD_RECOVER) | |
1061 | return (B_FALSE); | |
1062 | ||
1063 | /* | |
1064 | * If we are in the middle of opening a pool, and the previous attempt | |
1065 | * failed, don't bother logging any new ereports - we're just going to | |
1066 | * get the same diagnosis anyway. | |
1067 | */ | |
1068 | if (spa_load_state(spa) != SPA_LOAD_NONE && | |
1069 | spa->spa_last_open_failed) | |
1070 | return (B_FALSE); | |
1071 | ||
1072 | if (zio != NULL) { | |
1073 | /* | |
1074 | * If this is not a read or write zio, ignore the error. This | |
1075 | * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. | |
1076 | */ | |
1077 | if (zio->io_type != ZIO_TYPE_READ && | |
1078 | zio->io_type != ZIO_TYPE_WRITE) | |
1079 | return (B_FALSE); | |
1080 | ||
1081 | if (vd != NULL) { | |
1082 | /* | |
1083 | * If the vdev has already been marked as failing due | |
1084 | * to a failed probe, then ignore any subsequent I/O | |
1085 | * errors, as the DE will automatically fault the vdev | |
1086 | * on the first such failure. This also catches cases | |
1087 | * where vdev_remove_wanted is set and the device has | |
1088 | * not yet been asynchronously placed into the REMOVED | |
1089 | * state. | |
1090 | */ | |
1091 | if (zio->io_vd == vd && !vdev_accessible(vd, zio)) | |
1092 | return (B_FALSE); | |
1093 | ||
1094 | /* | |
1095 | * Ignore checksum errors for reads from DTL regions of | |
1096 | * leaf vdevs. | |
1097 | */ | |
1098 | if (zio->io_type == ZIO_TYPE_READ && | |
1099 | zio->io_error == ECKSUM && | |
1100 | vd->vdev_ops->vdev_op_leaf && | |
1101 | vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) | |
1102 | return (B_FALSE); | |
1103 | } | |
1104 | } | |
1105 | ||
1106 | /* | |
1107 | * For probe failure, we want to avoid posting ereports if we've | |
1108 | * already removed the device in the meantime. | |
1109 | */ | |
1110 | if (vd != NULL && | |
1111 | strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && | |
1112 | (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) | |
1113 | return (B_FALSE); | |
1114 | ||
1115 | /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ | |
1116 | if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && | |
1117 | (zio != NULL) && (!zio->io_timestamp)) { | |
1118 | return (B_FALSE); | |
1119 | } | |
14e4e3cb AZ |
1120 | #else |
1121 | (void) subclass, (void) spa, (void) vd, (void) zio; | |
ad796b8a TH |
1122 | #endif |
1123 | return (B_TRUE); | |
1124 | } | |
1125 | ||
1126 | /* | |
4f072827 DB |
1127 | * Post an ereport for the given subclass |
1128 | * | |
1129 | * Returns | |
1130 | * - 0 if an event was posted | |
1131 | * - EINVAL if there was a problem posting event | |
1132 | * - EBUSY if the event was rate limited | |
1133 | * - EALREADY if the event was already posted (duplicate) | |
ad796b8a TH |
1134 | */ |
1135 | int | |
b5256303 | 1136 | zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, |
4f072827 | 1137 | const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) |
428870ff | 1138 | { |
ad796b8a | 1139 | int rc = 0; |
428870ff BB |
1140 | #ifdef _KERNEL |
1141 | nvlist_t *ereport = NULL; | |
1142 | nvlist_t *detector = NULL; | |
1143 | ||
4f072827 DB |
1144 | if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) |
1145 | return (EINVAL); | |
1146 | ||
1147 | if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) | |
1148 | return (SET_ERROR(EALREADY)); | |
1149 | ||
17b43f96 | 1150 | if (zfs_is_ratelimiting_event(subclass, vd)) |
ad796b8a | 1151 | return (SET_ERROR(EBUSY)); |
17b43f96 | 1152 | |
ad796b8a | 1153 | if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, |
4f072827 | 1154 | zb, zio, state, 0)) |
ad796b8a | 1155 | return (SET_ERROR(EINVAL)); /* couldn't post event */ |
428870ff BB |
1156 | |
1157 | if (ereport == NULL) | |
ad796b8a | 1158 | return (SET_ERROR(EINVAL)); |
428870ff | 1159 | |
26685276 | 1160 | /* Cleanup is handled by the callback function */ |
ad796b8a | 1161 | rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); |
14e4e3cb AZ |
1162 | #else |
1163 | (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio, | |
1164 | (void) state; | |
34dc7c2f | 1165 | #endif |
ad796b8a | 1166 | return (rc); |
34dc7c2f BB |
1167 | } |
1168 | ||
4f072827 DB |
1169 | /* |
1170 | * Prepare a checksum ereport | |
1171 | * | |
1172 | * Returns | |
1173 | * - 0 if an event was posted | |
1174 | * - EINVAL if there was a problem posting event | |
1175 | * - EBUSY if the event was rate limited | |
1176 | * - EALREADY if the event was already posted (duplicate) | |
1177 | */ | |
1178 | int | |
a2c2ed1b | 1179 | zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, |
330c6c05 | 1180 | struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info) |
428870ff | 1181 | { |
6078881a TH |
1182 | zio_cksum_report_t *report; |
1183 | ||
6078881a | 1184 | #ifdef _KERNEL |
4f072827 DB |
1185 | if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) |
1186 | return (SET_ERROR(EINVAL)); | |
1187 | ||
1188 | if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, | |
1189 | offset, length)) | |
1190 | return (SET_ERROR(EALREADY)); | |
1191 | ||
6078881a | 1192 | if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) |
4f072827 | 1193 | return (SET_ERROR(EBUSY)); |
14e4e3cb AZ |
1194 | #else |
1195 | (void) zb, (void) offset; | |
6078881a TH |
1196 | #endif |
1197 | ||
1198 | report = kmem_zalloc(sizeof (*report), KM_SLEEP); | |
428870ff | 1199 | |
330c6c05 | 1200 | zio_vsd_default_cksum_report(zio, report); |
428870ff BB |
1201 | |
1202 | /* copy the checksum failure information if it was provided */ | |
1203 | if (info != NULL) { | |
79c76d5b | 1204 | report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); |
861166b0 | 1205 | memcpy(report->zcr_ckinfo, info, sizeof (*info)); |
428870ff BB |
1206 | } |
1207 | ||
b2255edc BB |
1208 | report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; |
1209 | report->zcr_align = | |
1210 | vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); | |
428870ff BB |
1211 | report->zcr_length = length; |
1212 | ||
1213 | #ifdef _KERNEL | |
1144586b | 1214 | (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, |
b5256303 | 1215 | FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); |
428870ff BB |
1216 | |
1217 | if (report->zcr_ereport == NULL) { | |
0426c168 | 1218 | zfs_ereport_free_checksum(report); |
4f072827 | 1219 | return (0); |
428870ff BB |
1220 | } |
1221 | #endif | |
1222 | ||
1223 | mutex_enter(&spa->spa_errlist_lock); | |
1224 | report->zcr_next = zio->io_logical->io_cksum_report; | |
1225 | zio->io_logical->io_cksum_report = report; | |
1226 | mutex_exit(&spa->spa_errlist_lock); | |
4f072827 | 1227 | return (0); |
428870ff BB |
1228 | } |
1229 | ||
1230 | void | |
84c07ada GN |
1231 | zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, |
1232 | const abd_t *bad_data, boolean_t drop_if_identical) | |
428870ff BB |
1233 | { |
1234 | #ifdef _KERNEL | |
0426c168 IH |
1235 | zfs_ecksum_info_t *info; |
1236 | ||
428870ff BB |
1237 | info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, |
1238 | good_data, bad_data, report->zcr_length, drop_if_identical); | |
428870ff | 1239 | if (info != NULL) |
26685276 BB |
1240 | zfs_zevent_post(report->zcr_ereport, |
1241 | report->zcr_detector, zfs_zevent_post_cb); | |
0426c168 IH |
1242 | else |
1243 | zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); | |
428870ff | 1244 | |
428870ff | 1245 | report->zcr_ereport = report->zcr_detector = NULL; |
428870ff BB |
1246 | if (info != NULL) |
1247 | kmem_free(info, sizeof (*info)); | |
14e4e3cb AZ |
1248 | #else |
1249 | (void) report, (void) good_data, (void) bad_data, | |
1250 | (void) drop_if_identical; | |
428870ff BB |
1251 | #endif |
1252 | } | |
1253 | ||
1254 | void | |
1255 | zfs_ereport_free_checksum(zio_cksum_report_t *rpt) | |
1256 | { | |
1257 | #ifdef _KERNEL | |
1258 | if (rpt->zcr_ereport != NULL) { | |
1259 | fm_nvlist_destroy(rpt->zcr_ereport, | |
1260 | FM_NVA_FREE); | |
1261 | fm_nvlist_destroy(rpt->zcr_detector, | |
1262 | FM_NVA_FREE); | |
1263 | } | |
1264 | #endif | |
1265 | rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); | |
1266 | ||
1267 | if (rpt->zcr_ckinfo != NULL) | |
1268 | kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); | |
1269 | ||
1270 | kmem_free(rpt, sizeof (*rpt)); | |
1271 | } | |
1272 | ||
4f072827 DB |
1273 | /* |
1274 | * Post a checksum ereport | |
1275 | * | |
1276 | * Returns | |
1277 | * - 0 if an event was posted | |
1278 | * - EINVAL if there was a problem posting event | |
1279 | * - EBUSY if the event was rate limited | |
1280 | * - EALREADY if the event was already posted (duplicate) | |
1281 | */ | |
ad796b8a | 1282 | int |
a2c2ed1b | 1283 | zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, |
428870ff | 1284 | struct zio *zio, uint64_t offset, uint64_t length, |
84c07ada | 1285 | const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) |
428870ff | 1286 | { |
ad796b8a | 1287 | int rc = 0; |
428870ff BB |
1288 | #ifdef _KERNEL |
1289 | nvlist_t *ereport = NULL; | |
1290 | nvlist_t *detector = NULL; | |
1291 | zfs_ecksum_info_t *info; | |
1292 | ||
4f072827 DB |
1293 | if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) |
1294 | return (SET_ERROR(EINVAL)); | |
1295 | ||
1296 | if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, | |
1297 | offset, length)) | |
1298 | return (SET_ERROR(EALREADY)); | |
1299 | ||
ad796b8a | 1300 | if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) |
4f072827 | 1301 | return (SET_ERROR(EBUSY)); |
428870ff | 1302 | |
ad796b8a TH |
1303 | if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, |
1304 | spa, vd, zb, zio, offset, length) || (ereport == NULL)) { | |
1305 | return (SET_ERROR(EINVAL)); | |
1306 | } | |
428870ff BB |
1307 | |
1308 | info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, | |
1309 | B_FALSE); | |
1310 | ||
26685276 | 1311 | if (info != NULL) { |
ad796b8a | 1312 | rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); |
428870ff | 1313 | kmem_free(info, sizeof (*info)); |
26685276 | 1314 | } |
14e4e3cb AZ |
1315 | #else |
1316 | (void) spa, (void) vd, (void) zb, (void) zio, (void) offset, | |
1317 | (void) length, (void) good_data, (void) bad_data, (void) zbc; | |
428870ff | 1318 | #endif |
ad796b8a | 1319 | return (rc); |
428870ff BB |
1320 | } |
1321 | ||
12fa0466 DE |
1322 | /* |
1323 | * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of | |
1324 | * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h | |
1325 | * and are designed to be consumed by the ZFS Event Daemon (ZED). For | |
1326 | * additional details refer to the zed(8) man page. | |
1327 | */ | |
1328 | nvlist_t * | |
1329 | zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, | |
d02ca379 | 1330 | nvlist_t *aux) |
34dc7c2f | 1331 | { |
12fa0466 | 1332 | nvlist_t *resource = NULL; |
34dc7c2f | 1333 | #ifdef _KERNEL |
34dc7c2f BB |
1334 | char class[64]; |
1335 | ||
428870ff | 1336 | if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) |
12fa0466 | 1337 | return (NULL); |
428870ff | 1338 | |
34dc7c2f | 1339 | if ((resource = fm_nvlist_create(NULL)) == NULL) |
12fa0466 | 1340 | return (NULL); |
34dc7c2f | 1341 | |
fb390aaf | 1342 | (void) snprintf(class, sizeof (class), "%s.%s.%s", type, |
34dc7c2f | 1343 | ZFS_ERROR_CLASS, name); |
904ea276 BB |
1344 | VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); |
1345 | VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); | |
bcdb96a3 C |
1346 | VERIFY0(nvlist_add_string(resource, |
1347 | FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); | |
904ea276 BB |
1348 | VERIFY0(nvlist_add_uint64(resource, |
1349 | FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); | |
bcdb96a3 C |
1350 | VERIFY0(nvlist_add_uint64(resource, |
1351 | FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); | |
904ea276 BB |
1352 | VERIFY0(nvlist_add_int32(resource, |
1353 | FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); | |
1354 | ||
26685276 | 1355 | if (vd) { |
904ea276 BB |
1356 | VERIFY0(nvlist_add_uint64(resource, |
1357 | FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); | |
1358 | VERIFY0(nvlist_add_uint64(resource, | |
1359 | FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); | |
fb390aaf HR |
1360 | if (vd->vdev_path != NULL) |
1361 | VERIFY0(nvlist_add_string(resource, | |
1362 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); | |
1363 | if (vd->vdev_devid != NULL) | |
1364 | VERIFY0(nvlist_add_string(resource, | |
1365 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); | |
1366 | if (vd->vdev_fru != NULL) | |
1367 | VERIFY0(nvlist_add_string(resource, | |
1368 | FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); | |
6568379e TH |
1369 | if (vd->vdev_enc_sysfs_path != NULL) |
1370 | VERIFY0(nvlist_add_string(resource, | |
1371 | FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, | |
1372 | vd->vdev_enc_sysfs_path)); | |
12fa0466 | 1373 | } |
d02ca379 | 1374 | |
12fa0466 DE |
1375 | /* also copy any optional payload data */ |
1376 | if (aux) { | |
1377 | nvpair_t *elem = NULL; | |
1378 | ||
1379 | while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) | |
1380 | (void) nvlist_add_nvpair(resource, elem); | |
26685276 | 1381 | } |
14e4e3cb AZ |
1382 | #else |
1383 | (void) spa, (void) vd, (void) type, (void) name, (void) aux; | |
12fa0466 DE |
1384 | #endif |
1385 | return (resource); | |
1386 | } | |
1387 | ||
1388 | static void | |
1389 | zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, | |
1390 | nvlist_t *aux) | |
1391 | { | |
1392 | #ifdef _KERNEL | |
1393 | nvlist_t *resource; | |
1394 | ||
1395 | resource = zfs_event_create(spa, vd, type, name, aux); | |
1396 | if (resource) | |
1397 | zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); | |
14e4e3cb AZ |
1398 | #else |
1399 | (void) spa, (void) vd, (void) type, (void) name, (void) aux; | |
34dc7c2f BB |
1400 | #endif |
1401 | } | |
1402 | ||
34dc7c2f BB |
1403 | /* |
1404 | * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev | |
1405 | * has been removed from the system. This will cause the DE to ignore any | |
1406 | * recent I/O errors, inferring that they are due to the asynchronous device | |
1407 | * removal. | |
1408 | */ | |
1409 | void | |
1410 | zfs_post_remove(spa_t *spa, vdev_t *vd) | |
1411 | { | |
d02ca379 | 1412 | zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); |
34dc7c2f BB |
1413 | } |
1414 | ||
1415 | /* | |
1416 | * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool | |
1417 | * has the 'autoreplace' property set, and therefore any broken vdevs will be | |
1418 | * handled by higher level logic, and no vdev fault should be generated. | |
1419 | */ | |
1420 | void | |
1421 | zfs_post_autoreplace(spa_t *spa, vdev_t *vd) | |
1422 | { | |
d02ca379 | 1423 | zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); |
34dc7c2f | 1424 | } |
428870ff BB |
1425 | |
1426 | /* | |
1427 | * The 'resource.fs.zfs.statechange' event is an internal signal that the | |
1428 | * given vdev has transitioned its state to DEGRADED or HEALTHY. This will | |
1429 | * cause the retire agent to repair any outstanding fault management cases | |
1430 | * open because the device was not found (fault.fs.zfs.device). | |
1431 | */ | |
1432 | void | |
d02ca379 | 1433 | zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) |
428870ff | 1434 | { |
d02ca379 DB |
1435 | #ifdef _KERNEL |
1436 | nvlist_t *aux; | |
1437 | ||
1438 | /* | |
1439 | * Add optional supplemental keys to payload | |
1440 | */ | |
1441 | aux = fm_nvlist_create(NULL); | |
1442 | if (vd && aux) { | |
1443 | if (vd->vdev_physpath) { | |
2a493a4c | 1444 | fnvlist_add_string(aux, |
d02ca379 DB |
1445 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, |
1446 | vd->vdev_physpath); | |
1447 | } | |
1bbd8770 | 1448 | if (vd->vdev_enc_sysfs_path) { |
2a493a4c | 1449 | fnvlist_add_string(aux, |
1bbd8770 TH |
1450 | FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, |
1451 | vd->vdev_enc_sysfs_path); | |
1452 | } | |
1453 | ||
2a493a4c | 1454 | fnvlist_add_uint64(aux, |
d02ca379 DB |
1455 | FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); |
1456 | } | |
1457 | ||
1458 | zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, | |
1459 | aux); | |
1460 | ||
1461 | if (aux) | |
1462 | fm_nvlist_destroy(aux, FM_NVA_FREE); | |
14e4e3cb AZ |
1463 | #else |
1464 | (void) spa, (void) vd, (void) laststate; | |
d02ca379 | 1465 | #endif |
fb390aaf HR |
1466 | } |
1467 | ||
4f072827 DB |
1468 | #ifdef _KERNEL |
1469 | void | |
1470 | zfs_ereport_init(void) | |
1471 | { | |
1472 | mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); | |
1473 | list_create(&recent_events_list, sizeof (recent_events_node_t), | |
1474 | offsetof(recent_events_node_t, re_list_link)); | |
1475 | avl_create(&recent_events_tree, recent_events_compare, | |
1476 | sizeof (recent_events_node_t), offsetof(recent_events_node_t, | |
1477 | re_tree_link)); | |
1478 | } | |
1479 | ||
1480 | /* | |
1481 | * This 'early' fini needs to run before zfs_fini() which on Linux waits | |
1482 | * for the system_delay_taskq to drain. | |
1483 | */ | |
1484 | void | |
1485 | zfs_ereport_taskq_fini(void) | |
1486 | { | |
1487 | mutex_enter(&recent_events_lock); | |
1488 | if (recent_events_cleaner_tqid != 0) { | |
1489 | taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); | |
1490 | recent_events_cleaner_tqid = 0; | |
1491 | } | |
1492 | mutex_exit(&recent_events_lock); | |
1493 | } | |
1494 | ||
1495 | void | |
1496 | zfs_ereport_fini(void) | |
1497 | { | |
1498 | recent_events_node_t *entry; | |
1499 | ||
b3ad3f48 | 1500 | while ((entry = list_remove_head(&recent_events_list)) != NULL) { |
4f072827 | 1501 | avl_remove(&recent_events_tree, entry); |
4f072827 DB |
1502 | kmem_free(entry, sizeof (*entry)); |
1503 | } | |
1504 | avl_destroy(&recent_events_tree); | |
1505 | list_destroy(&recent_events_list); | |
1506 | mutex_destroy(&recent_events_lock); | |
1507 | } | |
1508 | ||
5a54a4e0 JL |
1509 | void |
1510 | zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) | |
1511 | { | |
1512 | nvlist_t *aux; | |
1513 | ||
1514 | aux = fm_nvlist_create(NULL); | |
1b87195c | 1515 | fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); |
5a54a4e0 JL |
1516 | |
1517 | zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); | |
1518 | fm_nvlist_destroy(aux, FM_NVA_FREE); | |
1519 | } | |
1520 | ||
1521 | /* | |
1522 | * Post when a event when a zvol is created or removed | |
1523 | * | |
1524 | * This is currently only used by macOS, since it uses the event to create | |
1525 | * symlinks between the volume name (mypool/myvol) and the actual /dev | |
1526 | * device (/dev/disk3). For example: | |
1527 | * | |
1528 | * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3 | |
1529 | * | |
1530 | * name: The full name of the zvol ("mypool/myvol") | |
1531 | * dev_name: The full /dev name for the zvol ("/dev/disk3") | |
1532 | * raw_name: The raw /dev name for the zvol ("/dev/rdisk3") | |
1533 | */ | |
1534 | void | |
1535 | zfs_ereport_zvol_post(const char *subclass, const char *name, | |
1536 | const char *dev_name, const char *raw_name) | |
1537 | { | |
1538 | nvlist_t *aux; | |
1539 | char *r; | |
1540 | ||
1541 | boolean_t locked = mutex_owned(&spa_namespace_lock); | |
1542 | if (!locked) mutex_enter(&spa_namespace_lock); | |
1543 | spa_t *spa = spa_lookup(name); | |
1544 | if (!locked) mutex_exit(&spa_namespace_lock); | |
1545 | ||
1546 | if (spa == NULL) | |
1547 | return; | |
1548 | ||
1549 | aux = fm_nvlist_create(NULL); | |
2a493a4c RY |
1550 | fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); |
1551 | fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, | |
5a54a4e0 JL |
1552 | raw_name); |
1553 | r = strchr(name, '/'); | |
1554 | if (r && r[1]) | |
2a493a4c | 1555 | fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); |
5a54a4e0 JL |
1556 | |
1557 | zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); | |
1558 | fm_nvlist_destroy(aux, FM_NVA_FREE); | |
1559 | } | |
1560 | ||
26685276 | 1561 | EXPORT_SYMBOL(zfs_ereport_post); |
ad796b8a | 1562 | EXPORT_SYMBOL(zfs_ereport_is_valid); |
26685276 BB |
1563 | EXPORT_SYMBOL(zfs_ereport_post_checksum); |
1564 | EXPORT_SYMBOL(zfs_post_remove); | |
1565 | EXPORT_SYMBOL(zfs_post_autoreplace); | |
1566 | EXPORT_SYMBOL(zfs_post_state_change); | |
4f072827 DB |
1567 | |
1568 | ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, | |
1569 | "Maximum recent zevents records to retain for duplicate checking"); | |
1570 | ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, | |
1571 | "Expiration time for recent zevents records"); | |
26685276 | 1572 | #endif /* _KERNEL */ |