]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
9babb374 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
5ffb9d1d GW |
26 | /* |
27 | * Copyright (c) 2012 by Delphix. All rights reserved. | |
28 | */ | |
29 | ||
34dc7c2f BB |
30 | #include <sys/spa.h> |
31 | #include <sys/spa_impl.h> | |
32 | #include <sys/vdev.h> | |
33 | #include <sys/vdev_impl.h> | |
34 | #include <sys/zio.h> | |
428870ff | 35 | #include <sys/zio_checksum.h> |
34dc7c2f BB |
36 | |
37 | #include <sys/fm/fs/zfs.h> | |
38 | #include <sys/fm/protocol.h> | |
39 | #include <sys/fm/util.h> | |
40 | #include <sys/sysevent.h> | |
41 | ||
42 | /* | |
43 | * This general routine is responsible for generating all the different ZFS | |
44 | * ereports. The payload is dependent on the class, and which arguments are | |
45 | * supplied to the function: | |
46 | * | |
47 | * EREPORT POOL VDEV IO | |
48 | * block X X X | |
49 | * data X X | |
50 | * device X X | |
51 | * pool X | |
52 | * | |
53 | * If we are in a loading state, all errors are chained together by the same | |
b128c09f | 54 | * SPA-wide ENA (Error Numeric Association). |
34dc7c2f BB |
55 | * |
56 | * For isolated I/O requests, we get the ENA from the zio_t. The propagation | |
57 | * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want | |
58 | * to chain together all ereports associated with a logical piece of data. For | |
59 | * read I/Os, there are basically three 'types' of I/O, which form a roughly | |
60 | * layered diagram: | |
61 | * | |
62 | * +---------------+ | |
63 | * | Aggregate I/O | No associated logical data or device | |
64 | * +---------------+ | |
65 | * | | |
66 | * V | |
67 | * +---------------+ Reads associated with a piece of logical data. | |
68 | * | Read I/O | This includes reads on behalf of RAID-Z, | |
69 | * +---------------+ mirrors, gang blocks, retries, etc. | |
70 | * | | |
71 | * V | |
72 | * +---------------+ Reads associated with a particular device, but | |
73 | * | Physical I/O | no logical data. Issued as part of vdev caching | |
74 | * +---------------+ and I/O aggregation. | |
75 | * | |
76 | * Note that 'physical I/O' here is not the same terminology as used in the rest | |
77 | * of ZIO. Typically, 'physical I/O' simply means that there is no attached | |
78 | * blockpointer. But I/O with no associated block pointer can still be related | |
79 | * to a logical piece of data (i.e. RAID-Z requests). | |
80 | * | |
81 | * Purely physical I/O always have unique ENAs. They are not related to a | |
82 | * particular piece of logical data, and therefore cannot be chained together. | |
83 | * We still generate an ereport, but the DE doesn't correlate it with any | |
84 | * logical piece of data. When such an I/O fails, the delegated I/O requests | |
85 | * will issue a retry, which will trigger the 'real' ereport with the correct | |
86 | * ENA. | |
87 | * | |
88 | * We keep track of the ENA for a ZIO chain through the 'io_logical' member. | |
89 | * When a new logical I/O is issued, we set this to point to itself. Child I/Os | |
90 | * then inherit this pointer, so that when it is first set subsequent failures | |
b128c09f BB |
91 | * will use the same ENA. For vdev cache fill and queue aggregation I/O, |
92 | * this pointer is set to NULL, and no ereport will be generated (since it | |
93 | * doesn't actually correspond to any particular device or piece of data, | |
94 | * and the caller will always retry without caching or queueing anyway). | |
428870ff BB |
95 | * |
96 | * For checksum errors, we want to include more information about the actual | |
97 | * error which occurs. Accordingly, we build an ereport when the error is | |
98 | * noticed, but instead of sending it in immediately, we hang it off of the | |
99 | * io_cksum_report field of the logical IO. When the logical IO completes | |
100 | * (successfully or not), zfs_ereport_finish_checksum() is called with the | |
101 | * good and bad versions of the buffer (if available), and we annotate the | |
102 | * ereport with information about the differences. | |
34dc7c2f | 103 | */ |
428870ff | 104 | #ifdef _KERNEL |
12fa0466 | 105 | void |
26685276 BB |
106 | zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) |
107 | { | |
108 | if (nvl) | |
109 | fm_nvlist_destroy(nvl, FM_NVA_FREE); | |
110 | ||
111 | if (detector) | |
112 | fm_nvlist_destroy(detector, FM_NVA_FREE); | |
113 | } | |
114 | ||
6078881a TH |
115 | /* |
116 | * We want to rate limit ZIO delay and checksum events so as to not | |
117 | * flood ZED when a disk is acting up. | |
118 | * | |
119 | * Returns 1 if we're ratelimiting, 0 if not. | |
120 | */ | |
121 | static int | |
122 | zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) | |
123 | { | |
124 | int rc = 0; | |
125 | /* | |
126 | * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we | |
127 | * are. Invert it to get our return value. | |
128 | */ | |
129 | if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { | |
130 | rc = !zfs_ratelimit(&vd->vdev_delay_rl); | |
131 | } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { | |
132 | rc = !zfs_ratelimit(&vd->vdev_checksum_rl); | |
133 | } | |
134 | ||
135 | if (rc) { | |
136 | /* We're rate limiting */ | |
137 | fm_erpt_dropped_increment(); | |
138 | } | |
139 | ||
140 | return (rc); | |
141 | } | |
0426c168 | 142 | |
ad796b8a TH |
143 | /* |
144 | * Return B_TRUE if the event actually posted, B_FALSE if not. | |
145 | */ | |
146 | static boolean_t | |
428870ff | 147 | zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, |
a2c2ed1b | 148 | const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, |
b5256303 | 149 | zio_t *zio, uint64_t stateoroffset, uint64_t size) |
34dc7c2f | 150 | { |
34dc7c2f | 151 | nvlist_t *ereport, *detector; |
428870ff | 152 | |
34dc7c2f BB |
153 | uint64_t ena; |
154 | char class[64]; | |
155 | ||
ad796b8a TH |
156 | if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) |
157 | return (B_FALSE); | |
a32df59e | 158 | |
34dc7c2f | 159 | if ((ereport = fm_nvlist_create(NULL)) == NULL) |
ad796b8a | 160 | return (B_FALSE); |
34dc7c2f BB |
161 | |
162 | if ((detector = fm_nvlist_create(NULL)) == NULL) { | |
163 | fm_nvlist_destroy(ereport, FM_NVA_FREE); | |
ad796b8a | 164 | return (B_FALSE); |
34dc7c2f BB |
165 | } |
166 | ||
167 | /* | |
168 | * Serialize ereport generation | |
169 | */ | |
170 | mutex_enter(&spa->spa_errlist_lock); | |
171 | ||
172 | /* | |
173 | * Determine the ENA to use for this event. If we are in a loading | |
174 | * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use | |
175 | * a root zio-wide ENA. Otherwise, simply use a unique ENA. | |
176 | */ | |
428870ff | 177 | if (spa_load_state(spa) != SPA_LOAD_NONE) { |
34dc7c2f BB |
178 | if (spa->spa_ena == 0) |
179 | spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); | |
180 | ena = spa->spa_ena; | |
181 | } else if (zio != NULL && zio->io_logical != NULL) { | |
182 | if (zio->io_logical->io_ena == 0) | |
183 | zio->io_logical->io_ena = | |
184 | fm_ena_generate(0, FM_ENA_FMT1); | |
185 | ena = zio->io_logical->io_ena; | |
186 | } else { | |
187 | ena = fm_ena_generate(0, FM_ENA_FMT1); | |
188 | } | |
189 | ||
190 | /* | |
191 | * Construct the full class, detector, and other standard FMA fields. | |
192 | */ | |
193 | (void) snprintf(class, sizeof (class), "%s.%s", | |
194 | ZFS_ERROR_CLASS, subclass); | |
195 | ||
196 | fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), | |
197 | vd != NULL ? vd->vdev_guid : 0); | |
198 | ||
199 | fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); | |
200 | ||
201 | /* | |
202 | * Construct the per-ereport payload, depending on which parameters are | |
203 | * passed in. | |
204 | */ | |
205 | ||
206 | /* | |
207 | * Generic payload members common to all ereports. | |
34dc7c2f | 208 | */ |
bcdb96a3 C |
209 | fm_payload_set(ereport, |
210 | FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), | |
211 | FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), | |
177c91d0 DB |
212 | FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, |
213 | (uint64_t)spa_state(spa), | |
34dc7c2f | 214 | FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, |
177c91d0 | 215 | (int32_t)spa_load_state(spa), NULL); |
b128c09f | 216 | |
a36cc8d2 | 217 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, |
218 | DATA_TYPE_STRING, | |
219 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? | |
220 | FM_EREPORT_FAILMODE_WAIT : | |
221 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? | |
222 | FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, | |
223 | NULL); | |
34dc7c2f BB |
224 | |
225 | if (vd != NULL) { | |
226 | vdev_t *pvd = vd->vdev_parent; | |
cc92e9d0 | 227 | vdev_queue_t *vq = &vd->vdev_queue; |
904ea276 BB |
228 | vdev_stat_t *vs = &vd->vdev_stat; |
229 | vdev_t *spare_vd; | |
230 | uint64_t *spare_guids; | |
231 | char **spare_paths; | |
232 | int i, spare_count; | |
34dc7c2f BB |
233 | |
234 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, | |
235 | DATA_TYPE_UINT64, vd->vdev_guid, | |
236 | FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, | |
237 | DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); | |
9babb374 | 238 | if (vd->vdev_path != NULL) |
34dc7c2f BB |
239 | fm_payload_set(ereport, |
240 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, | |
241 | DATA_TYPE_STRING, vd->vdev_path, NULL); | |
9babb374 | 242 | if (vd->vdev_devid != NULL) |
34dc7c2f BB |
243 | fm_payload_set(ereport, |
244 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, | |
245 | DATA_TYPE_STRING, vd->vdev_devid, NULL); | |
9babb374 BB |
246 | if (vd->vdev_fru != NULL) |
247 | fm_payload_set(ereport, | |
248 | FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, | |
249 | DATA_TYPE_STRING, vd->vdev_fru, NULL); | |
6568379e TH |
250 | if (vd->vdev_enc_sysfs_path != NULL) |
251 | fm_payload_set(ereport, | |
252 | FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, | |
253 | DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); | |
32a9872b GW |
254 | if (vd->vdev_ashift) |
255 | fm_payload_set(ereport, | |
256 | FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, | |
257 | DATA_TYPE_UINT64, vd->vdev_ashift, NULL); | |
34dc7c2f | 258 | |
cc92e9d0 GW |
259 | if (vq != NULL) { |
260 | fm_payload_set(ereport, | |
261 | FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, | |
262 | DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); | |
263 | fm_payload_set(ereport, | |
264 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, | |
265 | DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); | |
266 | } | |
267 | ||
904ea276 BB |
268 | if (vs != NULL) { |
269 | fm_payload_set(ereport, | |
270 | FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, | |
271 | DATA_TYPE_UINT64, vs->vs_read_errors, | |
272 | FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, | |
273 | DATA_TYPE_UINT64, vs->vs_write_errors, | |
274 | FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, | |
ad796b8a TH |
275 | DATA_TYPE_UINT64, vs->vs_checksum_errors, |
276 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, | |
277 | DATA_TYPE_UINT64, vs->vs_slow_ios, | |
278 | NULL); | |
904ea276 BB |
279 | } |
280 | ||
34dc7c2f BB |
281 | if (pvd != NULL) { |
282 | fm_payload_set(ereport, | |
283 | FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, | |
284 | DATA_TYPE_UINT64, pvd->vdev_guid, | |
285 | FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, | |
286 | DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, | |
287 | NULL); | |
288 | if (pvd->vdev_path) | |
289 | fm_payload_set(ereport, | |
290 | FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, | |
291 | DATA_TYPE_STRING, pvd->vdev_path, NULL); | |
292 | if (pvd->vdev_devid) | |
293 | fm_payload_set(ereport, | |
294 | FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, | |
295 | DATA_TYPE_STRING, pvd->vdev_devid, NULL); | |
296 | } | |
904ea276 BB |
297 | |
298 | spare_count = spa->spa_spares.sav_count; | |
299 | spare_paths = kmem_zalloc(sizeof (char *) * spare_count, | |
79c76d5b | 300 | KM_SLEEP); |
904ea276 | 301 | spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, |
79c76d5b | 302 | KM_SLEEP); |
904ea276 BB |
303 | |
304 | for (i = 0; i < spare_count; i++) { | |
305 | spare_vd = spa->spa_spares.sav_vdevs[i]; | |
306 | if (spare_vd) { | |
307 | spare_paths[i] = spare_vd->vdev_path; | |
308 | spare_guids[i] = spare_vd->vdev_guid; | |
309 | } | |
310 | } | |
311 | ||
312 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, | |
313 | DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, | |
314 | FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, | |
315 | DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); | |
316 | ||
317 | kmem_free(spare_guids, sizeof (uint64_t) * spare_count); | |
318 | kmem_free(spare_paths, sizeof (char *) * spare_count); | |
34dc7c2f BB |
319 | } |
320 | ||
321 | if (zio != NULL) { | |
322 | /* | |
323 | * Payload common to all I/Os. | |
324 | */ | |
325 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, | |
326 | DATA_TYPE_INT32, zio->io_error, NULL); | |
312c07ed BB |
327 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, |
328 | DATA_TYPE_INT32, zio->io_flags, NULL); | |
9dcb9719 BB |
329 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, |
330 | DATA_TYPE_UINT32, zio->io_stage, NULL); | |
331 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, | |
332 | DATA_TYPE_UINT32, zio->io_pipeline, NULL); | |
a69052be BB |
333 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, |
334 | DATA_TYPE_UINT64, zio->io_delay, NULL); | |
cc92e9d0 GW |
335 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, |
336 | DATA_TYPE_UINT64, zio->io_timestamp, NULL); | |
cc92e9d0 GW |
337 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, |
338 | DATA_TYPE_UINT64, zio->io_delta, NULL); | |
34dc7c2f BB |
339 | |
340 | /* | |
341 | * If the 'size' parameter is non-zero, it indicates this is a | |
342 | * RAID-Z or other I/O where the physical offset and length are | |
343 | * provided for us, instead of within the zio_t. | |
344 | */ | |
345 | if (vd != NULL) { | |
346 | if (size) | |
347 | fm_payload_set(ereport, | |
348 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
349 | DATA_TYPE_UINT64, stateoroffset, | |
350 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
351 | DATA_TYPE_UINT64, size, NULL); | |
352 | else | |
353 | fm_payload_set(ereport, | |
354 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
355 | DATA_TYPE_UINT64, zio->io_offset, | |
356 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
357 | DATA_TYPE_UINT64, zio->io_size, NULL); | |
358 | } | |
34dc7c2f BB |
359 | } else if (vd != NULL) { |
360 | /* | |
361 | * If we have a vdev but no zio, this is a device fault, and the | |
362 | * 'stateoroffset' parameter indicates the previous state of the | |
363 | * vdev. | |
364 | */ | |
365 | fm_payload_set(ereport, | |
366 | FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, | |
367 | DATA_TYPE_UINT64, stateoroffset, NULL); | |
368 | } | |
428870ff | 369 | |
b5256303 TC |
370 | /* |
371 | * Payload for I/Os with corresponding logical information. | |
372 | */ | |
ad796b8a | 373 | if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { |
b5256303 TC |
374 | fm_payload_set(ereport, |
375 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, | |
376 | DATA_TYPE_UINT64, zb->zb_objset, | |
377 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, | |
378 | DATA_TYPE_UINT64, zb->zb_object, | |
379 | FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, | |
380 | DATA_TYPE_INT64, zb->zb_level, | |
381 | FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, | |
382 | DATA_TYPE_UINT64, zb->zb_blkid, NULL); | |
ad796b8a | 383 | } |
b5256303 | 384 | |
34dc7c2f BB |
385 | mutex_exit(&spa->spa_errlist_lock); |
386 | ||
428870ff BB |
387 | *ereport_out = ereport; |
388 | *detector_out = detector; | |
ad796b8a | 389 | return (B_TRUE); |
428870ff BB |
390 | } |
391 | ||
392 | /* if it's <= 128 bytes, save the corruption directly */ | |
393 | #define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) | |
394 | ||
395 | #define MAX_RANGES 16 | |
396 | ||
397 | typedef struct zfs_ecksum_info { | |
398 | /* histograms of set and cleared bits by bit number in a 64-bit word */ | |
1b18c6d7 AG |
399 | uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; |
400 | uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; | |
428870ff BB |
401 | |
402 | /* inline arrays of bits set and cleared. */ | |
403 | uint64_t zei_bits_set[ZFM_MAX_INLINE]; | |
404 | uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; | |
405 | ||
406 | /* | |
407 | * for each range, the number of bits set and cleared. The Hamming | |
408 | * distance between the good and bad buffers is the sum of them all. | |
409 | */ | |
410 | uint32_t zei_range_sets[MAX_RANGES]; | |
411 | uint32_t zei_range_clears[MAX_RANGES]; | |
412 | ||
413 | struct zei_ranges { | |
414 | uint32_t zr_start; | |
415 | uint32_t zr_end; | |
416 | } zei_ranges[MAX_RANGES]; | |
417 | ||
418 | size_t zei_range_count; | |
419 | uint32_t zei_mingap; | |
420 | uint32_t zei_allowed_mingap; | |
421 | ||
422 | } zfs_ecksum_info_t; | |
423 | ||
/*
 * For every bit set in 'value_arg' (after BE_64() conversion), bump the
 * matching slot of the 64-entry histogram 'hist', and add the number of set
 * bits to '*count'.
 */
static void
update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
{
	uint64_t word = BE_64(value_arg);
	size_t nset = 0;
	size_t bit;

	/*
	 * Bits are stored in big-endian (largest-first) order, so bit 'bit'
	 * of the word lands in slot 63 - bit.
	 */
	for (bit = 0; bit < 64; bit++) {
		if ((word & (1ull << bit)) == 0)
			continue;
		hist[63 - bit]++;
		nset++;
	}

	/* update the count of bits changed */
	*count += nset;
}
441 | ||
442 | /* | |
443 | * We've now filled up the range array, and need to increase "mingap" and | |
444 | * shrink the range list accordingly. zei_mingap is always the smallest | |
445 | * distance between array entries, so we set the new_allowed_gap to be | |
446 | * one greater than that. We then go through the list, joining together | |
447 | * any ranges which are closer than the new_allowed_gap. | |
448 | * | |
449 | * By construction, there will be at least one. We also update zei_mingap | |
450 | * to the new smallest gap, to prepare for our next invocation. | |
451 | */ | |
452 | static void | |
26685276 | 453 | zei_shrink_ranges(zfs_ecksum_info_t *eip) |
428870ff BB |
454 | { |
455 | uint32_t mingap = UINT32_MAX; | |
456 | uint32_t new_allowed_gap = eip->zei_mingap + 1; | |
457 | ||
458 | size_t idx, output; | |
459 | size_t max = eip->zei_range_count; | |
460 | ||
461 | struct zei_ranges *r = eip->zei_ranges; | |
462 | ||
463 | ASSERT3U(eip->zei_range_count, >, 0); | |
464 | ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); | |
465 | ||
466 | output = idx = 0; | |
467 | while (idx < max - 1) { | |
468 | uint32_t start = r[idx].zr_start; | |
469 | uint32_t end = r[idx].zr_end; | |
470 | ||
471 | while (idx < max - 1) { | |
26685276 | 472 | idx++; |
428870ff | 473 | |
1c27024e DB |
474 | uint32_t nstart = r[idx].zr_start; |
475 | uint32_t nend = r[idx].zr_end; | |
476 | ||
477 | uint32_t gap = nstart - end; | |
428870ff BB |
478 | if (gap < new_allowed_gap) { |
479 | end = nend; | |
480 | continue; | |
481 | } | |
482 | if (gap < mingap) | |
483 | mingap = gap; | |
484 | break; | |
485 | } | |
486 | r[output].zr_start = start; | |
487 | r[output].zr_end = end; | |
488 | output++; | |
489 | } | |
490 | ASSERT3U(output, <, eip->zei_range_count); | |
491 | eip->zei_range_count = output; | |
492 | eip->zei_mingap = mingap; | |
493 | eip->zei_allowed_mingap = new_allowed_gap; | |
494 | } | |
495 | ||
496 | static void | |
26685276 | 497 | zei_add_range(zfs_ecksum_info_t *eip, int start, int end) |
428870ff BB |
498 | { |
499 | struct zei_ranges *r = eip->zei_ranges; | |
500 | size_t count = eip->zei_range_count; | |
501 | ||
502 | if (count >= MAX_RANGES) { | |
26685276 | 503 | zei_shrink_ranges(eip); |
428870ff BB |
504 | count = eip->zei_range_count; |
505 | } | |
506 | if (count == 0) { | |
507 | eip->zei_mingap = UINT32_MAX; | |
508 | eip->zei_allowed_mingap = 1; | |
509 | } else { | |
510 | int gap = start - r[count - 1].zr_end; | |
511 | ||
512 | if (gap < eip->zei_allowed_mingap) { | |
513 | r[count - 1].zr_end = end; | |
514 | return; | |
515 | } | |
516 | if (gap < eip->zei_mingap) | |
517 | eip->zei_mingap = gap; | |
518 | } | |
519 | r[count].zr_start = start; | |
520 | r[count].zr_end = end; | |
521 | eip->zei_range_count++; | |
522 | } | |
523 | ||
524 | static size_t | |
26685276 | 525 | zei_range_total_size(zfs_ecksum_info_t *eip) |
428870ff BB |
526 | { |
527 | struct zei_ranges *r = eip->zei_ranges; | |
528 | size_t count = eip->zei_range_count; | |
529 | size_t result = 0; | |
530 | size_t idx; | |
531 | ||
532 | for (idx = 0; idx < count; idx++) | |
533 | result += (r[idx].zr_end - r[idx].zr_start); | |
534 | ||
535 | return (result); | |
536 | } | |
537 | ||
538 | static zfs_ecksum_info_t * | |
539 | annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, | |
84c07ada | 540 | const abd_t *goodabd, const abd_t *badabd, size_t size, |
428870ff BB |
541 | boolean_t drop_if_identical) |
542 | { | |
84c07ada GN |
543 | const uint64_t *good; |
544 | const uint64_t *bad; | |
428870ff BB |
545 | |
546 | uint64_t allset = 0; | |
547 | uint64_t allcleared = 0; | |
548 | ||
549 | size_t nui64s = size / sizeof (uint64_t); | |
550 | ||
551 | size_t inline_size; | |
552 | int no_inline = 0; | |
553 | size_t idx; | |
554 | size_t range; | |
555 | ||
556 | size_t offset = 0; | |
557 | ssize_t start = -1; | |
558 | ||
79c76d5b | 559 | zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); |
428870ff BB |
560 | |
561 | /* don't do any annotation for injected checksum errors */ | |
562 | if (info != NULL && info->zbc_injected) | |
563 | return (eip); | |
564 | ||
565 | if (info != NULL && info->zbc_has_cksum) { | |
566 | fm_payload_set(ereport, | |
567 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, | |
568 | DATA_TYPE_UINT64_ARRAY, | |
569 | sizeof (info->zbc_expected) / sizeof (uint64_t), | |
570 | (uint64_t *)&info->zbc_expected, | |
571 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, | |
572 | DATA_TYPE_UINT64_ARRAY, | |
573 | sizeof (info->zbc_actual) / sizeof (uint64_t), | |
574 | (uint64_t *)&info->zbc_actual, | |
575 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, | |
576 | DATA_TYPE_STRING, | |
577 | info->zbc_checksum_name, | |
578 | NULL); | |
579 | ||
580 | if (info->zbc_byteswapped) { | |
581 | fm_payload_set(ereport, | |
582 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, | |
583 | DATA_TYPE_BOOLEAN, 1, | |
584 | NULL); | |
585 | } | |
586 | } | |
587 | ||
84c07ada | 588 | if (badabd == NULL || goodabd == NULL) |
428870ff BB |
589 | return (eip); |
590 | ||
1b18c6d7 | 591 | ASSERT3U(nui64s, <=, UINT32_MAX); |
428870ff BB |
592 | ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); |
593 | ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); | |
594 | ASSERT3U(size, <=, UINT32_MAX); | |
595 | ||
84c07ada GN |
596 | good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); |
597 | bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); | |
598 | ||
428870ff BB |
599 | /* build up the range list by comparing the two buffers. */ |
600 | for (idx = 0; idx < nui64s; idx++) { | |
601 | if (good[idx] == bad[idx]) { | |
602 | if (start == -1) | |
603 | continue; | |
604 | ||
26685276 | 605 | zei_add_range(eip, start, idx); |
428870ff BB |
606 | start = -1; |
607 | } else { | |
608 | if (start != -1) | |
609 | continue; | |
610 | ||
611 | start = idx; | |
612 | } | |
613 | } | |
614 | if (start != -1) | |
26685276 | 615 | zei_add_range(eip, start, idx); |
428870ff BB |
616 | |
617 | /* See if it will fit in our inline buffers */ | |
26685276 | 618 | inline_size = zei_range_total_size(eip); |
428870ff BB |
619 | if (inline_size > ZFM_MAX_INLINE) |
620 | no_inline = 1; | |
621 | ||
622 | /* | |
623 | * If there is no change and we want to drop if the buffers are | |
624 | * identical, do so. | |
625 | */ | |
626 | if (inline_size == 0 && drop_if_identical) { | |
627 | kmem_free(eip, sizeof (*eip)); | |
84c07ada GN |
628 | abd_return_buf((abd_t *)goodabd, (void *)good, size); |
629 | abd_return_buf((abd_t *)badabd, (void *)bad, size); | |
428870ff BB |
630 | return (NULL); |
631 | } | |
632 | ||
633 | /* | |
634 | * Now walk through the ranges, filling in the details of the | |
635 | * differences. Also convert our uint64_t-array offsets to byte | |
636 | * offsets. | |
637 | */ | |
638 | for (range = 0; range < eip->zei_range_count; range++) { | |
639 | size_t start = eip->zei_ranges[range].zr_start; | |
640 | size_t end = eip->zei_ranges[range].zr_end; | |
641 | ||
642 | for (idx = start; idx < end; idx++) { | |
643 | uint64_t set, cleared; | |
644 | ||
645 | // bits set in bad, but not in good | |
646 | set = ((~good[idx]) & bad[idx]); | |
647 | // bits set in good, but not in bad | |
648 | cleared = (good[idx] & (~bad[idx])); | |
649 | ||
650 | allset |= set; | |
651 | allcleared |= cleared; | |
652 | ||
653 | if (!no_inline) { | |
654 | ASSERT3U(offset, <, inline_size); | |
655 | eip->zei_bits_set[offset] = set; | |
656 | eip->zei_bits_cleared[offset] = cleared; | |
657 | offset++; | |
658 | } | |
659 | ||
660 | update_histogram(set, eip->zei_histogram_set, | |
661 | &eip->zei_range_sets[range]); | |
662 | update_histogram(cleared, eip->zei_histogram_cleared, | |
663 | &eip->zei_range_clears[range]); | |
664 | } | |
665 | ||
666 | /* convert to byte offsets */ | |
667 | eip->zei_ranges[range].zr_start *= sizeof (uint64_t); | |
668 | eip->zei_ranges[range].zr_end *= sizeof (uint64_t); | |
669 | } | |
84c07ada GN |
670 | |
671 | abd_return_buf((abd_t *)goodabd, (void *)good, size); | |
672 | abd_return_buf((abd_t *)badabd, (void *)bad, size); | |
673 | ||
428870ff BB |
674 | eip->zei_allowed_mingap *= sizeof (uint64_t); |
675 | inline_size *= sizeof (uint64_t); | |
676 | ||
677 | /* fill in ereport */ | |
678 | fm_payload_set(ereport, | |
679 | FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, | |
680 | DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, | |
681 | (uint32_t *)eip->zei_ranges, | |
682 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, | |
683 | DATA_TYPE_UINT32, eip->zei_allowed_mingap, | |
684 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, | |
685 | DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, | |
686 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, | |
687 | DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, | |
688 | NULL); | |
689 | ||
690 | if (!no_inline) { | |
691 | fm_payload_set(ereport, | |
692 | FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, | |
693 | DATA_TYPE_UINT8_ARRAY, | |
694 | inline_size, (uint8_t *)eip->zei_bits_set, | |
695 | FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, | |
696 | DATA_TYPE_UINT8_ARRAY, | |
697 | inline_size, (uint8_t *)eip->zei_bits_cleared, | |
698 | NULL); | |
699 | } else { | |
700 | fm_payload_set(ereport, | |
701 | FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, | |
1b18c6d7 | 702 | DATA_TYPE_UINT32_ARRAY, |
428870ff BB |
703 | NBBY * sizeof (uint64_t), eip->zei_histogram_set, |
704 | FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, | |
1b18c6d7 | 705 | DATA_TYPE_UINT32_ARRAY, |
428870ff BB |
706 | NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, |
707 | NULL); | |
708 | } | |
709 | return (eip); | |
710 | } | |
711 | #endif | |
712 | ||
ad796b8a TH |
713 | /* |
714 | * Make sure our event is still valid for the given zio/vdev/pool. For example, | |
715 | * we don't want to keep logging events for a faulted or missing vdev. | |
716 | */ | |
717 | boolean_t | |
718 | zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) | |
719 | { | |
720 | #ifdef _KERNEL | |
721 | /* | |
722 | * If we are doing a spa_tryimport() or in recovery mode, | |
723 | * ignore errors. | |
724 | */ | |
725 | if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || | |
726 | spa_load_state(spa) == SPA_LOAD_RECOVER) | |
727 | return (B_FALSE); | |
728 | ||
729 | /* | |
730 | * If we are in the middle of opening a pool, and the previous attempt | |
731 | * failed, don't bother logging any new ereports - we're just going to | |
732 | * get the same diagnosis anyway. | |
733 | */ | |
734 | if (spa_load_state(spa) != SPA_LOAD_NONE && | |
735 | spa->spa_last_open_failed) | |
736 | return (B_FALSE); | |
737 | ||
738 | if (zio != NULL) { | |
739 | /* | |
740 | * If this is not a read or write zio, ignore the error. This | |
741 | * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. | |
742 | */ | |
743 | if (zio->io_type != ZIO_TYPE_READ && | |
744 | zio->io_type != ZIO_TYPE_WRITE) | |
745 | return (B_FALSE); | |
746 | ||
747 | if (vd != NULL) { | |
748 | /* | |
749 | * If the vdev has already been marked as failing due | |
750 | * to a failed probe, then ignore any subsequent I/O | |
751 | * errors, as the DE will automatically fault the vdev | |
752 | * on the first such failure. This also catches cases | |
753 | * where vdev_remove_wanted is set and the device has | |
754 | * not yet been asynchronously placed into the REMOVED | |
755 | * state. | |
756 | */ | |
757 | if (zio->io_vd == vd && !vdev_accessible(vd, zio)) | |
758 | return (B_FALSE); | |
759 | ||
760 | /* | |
761 | * Ignore checksum errors for reads from DTL regions of | |
762 | * leaf vdevs. | |
763 | */ | |
764 | if (zio->io_type == ZIO_TYPE_READ && | |
765 | zio->io_error == ECKSUM && | |
766 | vd->vdev_ops->vdev_op_leaf && | |
767 | vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) | |
768 | return (B_FALSE); | |
769 | } | |
770 | } | |
771 | ||
772 | /* | |
773 | * For probe failure, we want to avoid posting ereports if we've | |
774 | * already removed the device in the meantime. | |
775 | */ | |
776 | if (vd != NULL && | |
777 | strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && | |
778 | (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) | |
779 | return (B_FALSE); | |
780 | ||
781 | /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ | |
782 | if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && | |
783 | (zio != NULL) && (!zio->io_timestamp)) { | |
784 | return (B_FALSE); | |
785 | } | |
786 | #endif | |
787 | return (B_TRUE); | |
788 | } | |
789 | ||
790 | /* | |
791 | * Return 0 if event was posted, EINVAL if there was a problem posting it or | |
792 | * EBUSY if the event was rate limited. | |
793 | */ | |
794 | int | |
b5256303 | 795 | zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, |
a2c2ed1b TC |
796 | const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, |
797 | uint64_t size) | |
428870ff | 798 | { |
ad796b8a | 799 | int rc = 0; |
428870ff BB |
800 | #ifdef _KERNEL |
801 | nvlist_t *ereport = NULL; | |
802 | nvlist_t *detector = NULL; | |
803 | ||
17b43f96 | 804 | if (zfs_is_ratelimiting_event(subclass, vd)) |
ad796b8a | 805 | return (SET_ERROR(EBUSY)); |
17b43f96 | 806 | |
ad796b8a TH |
807 | if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, |
808 | zb, zio, stateoroffset, size)) | |
809 | return (SET_ERROR(EINVAL)); /* couldn't post event */ | |
428870ff BB |
810 | |
811 | if (ereport == NULL) | |
ad796b8a | 812 | return (SET_ERROR(EINVAL)); |
428870ff | 813 | |
26685276 | 814 | /* Cleanup is handled by the callback function */ |
ad796b8a | 815 | rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); |
34dc7c2f | 816 | #endif |
ad796b8a | 817 | return (rc); |
34dc7c2f BB |
818 | } |
819 | ||
428870ff | 820 | void |
a2c2ed1b | 821 | zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, |
428870ff BB |
822 | struct zio *zio, uint64_t offset, uint64_t length, void *arg, |
823 | zio_bad_cksum_t *info) | |
824 | { | |
6078881a TH |
825 | zio_cksum_report_t *report; |
826 | ||
6078881a TH |
827 | #ifdef _KERNEL |
828 | if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) | |
829 | return; | |
830 | #endif | |
831 | ||
832 | report = kmem_zalloc(sizeof (*report), KM_SLEEP); | |
428870ff BB |
833 | |
834 | if (zio->io_vsd != NULL) | |
835 | zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); | |
836 | else | |
837 | zio_vsd_default_cksum_report(zio, report, arg); | |
838 | ||
839 | /* copy the checksum failure information if it was provided */ | |
840 | if (info != NULL) { | |
79c76d5b | 841 | report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); |
428870ff BB |
842 | bcopy(info, report->zcr_ckinfo, sizeof (*info)); |
843 | } | |
844 | ||
845 | report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; | |
846 | report->zcr_length = length; | |
847 | ||
848 | #ifdef _KERNEL | |
849 | zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, | |
b5256303 | 850 | FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); |
428870ff BB |
851 | |
852 | if (report->zcr_ereport == NULL) { | |
0426c168 | 853 | zfs_ereport_free_checksum(report); |
428870ff BB |
854 | return; |
855 | } | |
856 | #endif | |
857 | ||
858 | mutex_enter(&spa->spa_errlist_lock); | |
859 | report->zcr_next = zio->io_logical->io_cksum_report; | |
860 | zio->io_logical->io_cksum_report = report; | |
861 | mutex_exit(&spa->spa_errlist_lock); | |
862 | } | |
863 | ||
864 | void | |
84c07ada GN |
865 | zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, |
866 | const abd_t *bad_data, boolean_t drop_if_identical) | |
428870ff BB |
867 | { |
868 | #ifdef _KERNEL | |
0426c168 IH |
869 | zfs_ecksum_info_t *info; |
870 | ||
428870ff BB |
871 | info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, |
872 | good_data, bad_data, report->zcr_length, drop_if_identical); | |
428870ff | 873 | if (info != NULL) |
26685276 BB |
874 | zfs_zevent_post(report->zcr_ereport, |
875 | report->zcr_detector, zfs_zevent_post_cb); | |
0426c168 IH |
876 | else |
877 | zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); | |
428870ff | 878 | |
428870ff | 879 | report->zcr_ereport = report->zcr_detector = NULL; |
428870ff BB |
880 | if (info != NULL) |
881 | kmem_free(info, sizeof (*info)); | |
882 | #endif | |
883 | } | |
884 | ||
885 | void | |
886 | zfs_ereport_free_checksum(zio_cksum_report_t *rpt) | |
887 | { | |
888 | #ifdef _KERNEL | |
889 | if (rpt->zcr_ereport != NULL) { | |
890 | fm_nvlist_destroy(rpt->zcr_ereport, | |
891 | FM_NVA_FREE); | |
892 | fm_nvlist_destroy(rpt->zcr_detector, | |
893 | FM_NVA_FREE); | |
894 | } | |
895 | #endif | |
896 | rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); | |
897 | ||
898 | if (rpt->zcr_ckinfo != NULL) | |
899 | kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); | |
900 | ||
901 | kmem_free(rpt, sizeof (*rpt)); | |
902 | } | |
903 | ||
428870ff | 904 | |
ad796b8a | 905 | int |
a2c2ed1b | 906 | zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, |
428870ff | 907 | struct zio *zio, uint64_t offset, uint64_t length, |
84c07ada | 908 | const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) |
428870ff | 909 | { |
ad796b8a | 910 | int rc = 0; |
428870ff BB |
911 | #ifdef _KERNEL |
912 | nvlist_t *ereport = NULL; | |
913 | nvlist_t *detector = NULL; | |
914 | zfs_ecksum_info_t *info; | |
915 | ||
ad796b8a TH |
916 | if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) |
917 | return (EBUSY); | |
428870ff | 918 | |
ad796b8a TH |
919 | if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, |
920 | spa, vd, zb, zio, offset, length) || (ereport == NULL)) { | |
921 | return (SET_ERROR(EINVAL)); | |
922 | } | |
428870ff BB |
923 | |
924 | info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, | |
925 | B_FALSE); | |
926 | ||
26685276 | 927 | if (info != NULL) { |
ad796b8a | 928 | rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); |
428870ff | 929 | kmem_free(info, sizeof (*info)); |
26685276 | 930 | } |
428870ff | 931 | #endif |
ad796b8a | 932 | return (rc); |
428870ff BB |
933 | } |
934 | ||
12fa0466 DE |
935 | /* |
936 | * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of | |
937 | * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h | |
938 | * and are designed to be consumed by the ZFS Event Daemon (ZED). For | |
939 | * additional details refer to the zed(8) man page. | |
940 | */ | |
941 | nvlist_t * | |
942 | zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, | |
d02ca379 | 943 | nvlist_t *aux) |
34dc7c2f | 944 | { |
12fa0466 | 945 | nvlist_t *resource = NULL; |
34dc7c2f | 946 | #ifdef _KERNEL |
34dc7c2f BB |
947 | char class[64]; |
948 | ||
428870ff | 949 | if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) |
12fa0466 | 950 | return (NULL); |
428870ff | 951 | |
34dc7c2f | 952 | if ((resource = fm_nvlist_create(NULL)) == NULL) |
12fa0466 | 953 | return (NULL); |
34dc7c2f | 954 | |
fb390aaf | 955 | (void) snprintf(class, sizeof (class), "%s.%s.%s", type, |
34dc7c2f | 956 | ZFS_ERROR_CLASS, name); |
904ea276 BB |
957 | VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); |
958 | VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); | |
bcdb96a3 C |
959 | VERIFY0(nvlist_add_string(resource, |
960 | FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); | |
904ea276 BB |
961 | VERIFY0(nvlist_add_uint64(resource, |
962 | FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); | |
bcdb96a3 C |
963 | VERIFY0(nvlist_add_uint64(resource, |
964 | FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); | |
904ea276 BB |
965 | VERIFY0(nvlist_add_int32(resource, |
966 | FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); | |
967 | ||
26685276 | 968 | if (vd) { |
904ea276 BB |
969 | VERIFY0(nvlist_add_uint64(resource, |
970 | FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); | |
971 | VERIFY0(nvlist_add_uint64(resource, | |
972 | FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); | |
fb390aaf HR |
973 | if (vd->vdev_path != NULL) |
974 | VERIFY0(nvlist_add_string(resource, | |
975 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); | |
976 | if (vd->vdev_devid != NULL) | |
977 | VERIFY0(nvlist_add_string(resource, | |
978 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); | |
979 | if (vd->vdev_fru != NULL) | |
980 | VERIFY0(nvlist_add_string(resource, | |
981 | FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); | |
6568379e TH |
982 | if (vd->vdev_enc_sysfs_path != NULL) |
983 | VERIFY0(nvlist_add_string(resource, | |
984 | FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, | |
985 | vd->vdev_enc_sysfs_path)); | |
12fa0466 | 986 | } |
d02ca379 | 987 | |
12fa0466 DE |
988 | /* also copy any optional payload data */ |
989 | if (aux) { | |
990 | nvpair_t *elem = NULL; | |
991 | ||
992 | while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) | |
993 | (void) nvlist_add_nvpair(resource, elem); | |
26685276 | 994 | } |
34dc7c2f | 995 | |
12fa0466 DE |
996 | #endif |
997 | return (resource); | |
998 | } | |
999 | ||
1000 | static void | |
1001 | zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, | |
1002 | nvlist_t *aux) | |
1003 | { | |
1004 | #ifdef _KERNEL | |
1005 | nvlist_t *resource; | |
1006 | ||
1007 | resource = zfs_event_create(spa, vd, type, name, aux); | |
1008 | if (resource) | |
1009 | zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); | |
34dc7c2f BB |
1010 | #endif |
1011 | } | |
1012 | ||
34dc7c2f BB |
1013 | /* |
1014 | * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev | |
1015 | * has been removed from the system. This will cause the DE to ignore any | |
1016 | * recent I/O errors, inferring that they are due to the asynchronous device | |
1017 | * removal. | |
1018 | */ | |
1019 | void | |
1020 | zfs_post_remove(spa_t *spa, vdev_t *vd) | |
1021 | { | |
d02ca379 | 1022 | zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); |
34dc7c2f BB |
1023 | } |
1024 | ||
1025 | /* | |
1026 | * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool | |
1027 | * has the 'autoreplace' property set, and therefore any broken vdevs will be | |
1028 | * handled by higher level logic, and no vdev fault should be generated. | |
1029 | */ | |
1030 | void | |
1031 | zfs_post_autoreplace(spa_t *spa, vdev_t *vd) | |
1032 | { | |
d02ca379 | 1033 | zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); |
34dc7c2f | 1034 | } |
428870ff BB |
1035 | |
1036 | /* | |
1037 | * The 'resource.fs.zfs.statechange' event is an internal signal that the | |
1038 | * given vdev has transitioned its state to DEGRADED or HEALTHY. This will | |
1039 | * cause the retire agent to repair any outstanding fault management cases | |
1040 | * open because the device was not found (fault.fs.zfs.device). | |
1041 | */ | |
1042 | void | |
d02ca379 | 1043 | zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) |
428870ff | 1044 | { |
d02ca379 DB |
1045 | #ifdef _KERNEL |
1046 | nvlist_t *aux; | |
1047 | ||
1048 | /* | |
1049 | * Add optional supplemental keys to payload | |
1050 | */ | |
1051 | aux = fm_nvlist_create(NULL); | |
1052 | if (vd && aux) { | |
1053 | if (vd->vdev_physpath) { | |
1054 | (void) nvlist_add_string(aux, | |
1055 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, | |
1056 | vd->vdev_physpath); | |
1057 | } | |
1bbd8770 TH |
1058 | if (vd->vdev_enc_sysfs_path) { |
1059 | (void) nvlist_add_string(aux, | |
1060 | FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, | |
1061 | vd->vdev_enc_sysfs_path); | |
1062 | } | |
1063 | ||
d02ca379 DB |
1064 | (void) nvlist_add_uint64(aux, |
1065 | FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); | |
1066 | } | |
1067 | ||
1068 | zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, | |
1069 | aux); | |
1070 | ||
1071 | if (aux) | |
1072 | fm_nvlist_destroy(aux, FM_NVA_FREE); | |
1073 | #endif | |
fb390aaf HR |
1074 | } |
1075 | ||
/*
 * Symbols exported from this file in kernel builds.
 */
#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_ereport_post);
EXPORT_SYMBOL(zfs_ereport_is_valid);
EXPORT_SYMBOL(zfs_ereport_post_checksum);
EXPORT_SYMBOL(zfs_post_remove);
EXPORT_SYMBOL(zfs_post_autoreplace);
EXPORT_SYMBOL(zfs_post_state_change);
#endif /* _KERNEL */